diff options
author | Riccardo Spagni <ric@spagni.net> | 2018-11-04 20:46:41 +0200 |
---|---|---|
committer | Riccardo Spagni <ric@spagni.net> | 2018-11-04 20:46:42 +0200 |
commit | 6d3d8635bea2ca04937521a2b03215c7fe8e3262 (patch) | |
tree | ca455c8c2c75ca1dc533f43a7b30825fbc02c590 /src/ringct/multiexp.cc | |
parent | Merge pull request #4692 (diff) | |
parent | multiexp: some minor speedups (diff) | |
download | monero-6d3d8635bea2ca04937521a2b03215c7fe8e3262.tar.xz |
Merge pull request #4693
74fb3d88 multiexp: some minor speedups (moneromooo-monero)
a6d2e246 bulletproofs: only enable profiling on request (moneromooo-monero)
a110e6aa multiexp: tune which variants to use for which number of points (moneromooo-monero)
8b476722 bulletproofs: speedup prover (moneromooo-monero)
6f9ae5b6 multiexp: handle pippenger multiexps with part precalc (moneromooo-monero)
10e5a927 bulletproofs: maintain -z4, -z5, and -y0 to avoid subtractions (moneromooo-monero)
8629a42c bulletproofs: rework flow to use sarang's fast batch inversion code (moneromooo-monero)
fc9f7d9c bulletproofs: merge multiexps as per sarang's new python code (moneromooo-monero)
4061960a multiexp: pack the digits table when STRAUS_C is 4 (moneromooo-monero)
bf8e4b98 bulletproofs: some more minor speedup (moneromooo-monero)
c415df97 performance_tests: sc_check and ge_dsm_precomp (moneromooo-monero)
a281b950 bulletproofs: remove single value prover (moneromooo-monero)
484155d0 bulletproofs: some more speedup (moneromooo-monero)
a621d6c8 bulletproofs: random minor speedups (moneromooo-monero)
a49a1761 bulletproofs: shave off a lot of scalar muls from the g/h construction (moneromooo-monero)
4564a5d1 bulletproofs: speedup PROVE (moneromooo-monero)
Diffstat (limited to 'src/ringct/multiexp.cc')
-rw-r--r-- | src/ringct/multiexp.cc | 130 |
1 files changed, 88 insertions, 42 deletions
diff --git a/src/ringct/multiexp.cc b/src/ringct/multiexp.cc index 21957b94c..6f77fed34 100644 --- a/src/ringct/multiexp.cc +++ b/src/ringct/multiexp.cc @@ -79,6 +79,25 @@ extern "C" // Best/cached Straus Straus Straus Straus Straus Straus Straus Straus Pip Pip Pip Pip // Best/uncached Straus Straus Straus Straus Straus Straus Pip Pip Pip Pip Pip Pip +// New timings: +// Pippenger: +// 2/1 always +// 3/2 at ~13 +// 4/3 at ~29 +// 5/4 at ~83 +// 6/5 < 200 +// 7/6 at ~470 +// 8/7 at ~1180 +// 9/8 at ~2290 +// Cached Pippenger: +// 6/5 < 200 +// 7/6 at 460 +// 8/7 at 1180 +// 9/8 at 2300 +// +// Cached Straus/Pippenger cross at 232 +// + namespace rct { @@ -320,7 +339,7 @@ rct::key bos_coster_heap_conv_robust(std::vector<MultiexpData> data) return res; } -static constexpr unsigned int STRAUS_C = 4; +#define STRAUS_C 4 struct straus_cached_data { @@ -447,28 +466,26 @@ rct::key straus(const std::vector<MultiexpData> &data, const std::shared_ptr<str #endif MULTIEXP_PERF(PERF_TIMER_START_UNIT(digits, 1000000)); +#if STRAUS_C==4 + std::unique_ptr<uint8_t[]> digits{new uint8_t[64 * data.size()]}; +#else std::unique_ptr<uint8_t[]> digits{new uint8_t[256 * data.size()]}; +#endif for (size_t j = 0; j < data.size(); ++j) { - unsigned char bytes33[33]; - memcpy(bytes33, data[j].scalar.bytes, 32); - bytes33[32] = 0; - const unsigned char *bytes = bytes33; -#if 1 - static_assert(STRAUS_C == 4, "optimized version needs STRAUS_C == 4"); + const unsigned char *bytes = data[j].scalar.bytes; +#if STRAUS_C==4 unsigned int i; - for (i = 0; i < 256; i += 8, bytes++) + for (i = 0; i < 64; i += 2, bytes++) { - digits[j*256+i] = bytes[0] & 0xf; - digits[j*256+i+1] = (bytes[0] >> 1) & 0xf; - digits[j*256+i+2] = (bytes[0] >> 2) & 0xf; - digits[j*256+i+3] = (bytes[0] >> 3) & 0xf; - digits[j*256+i+4] = ((bytes[0] >> 4) | (bytes[1]<<4)) & 0xf; - digits[j*256+i+5] = ((bytes[0] >> 5) | (bytes[1]<<3)) & 0xf; - digits[j*256+i+6] = ((bytes[0] >> 6) | (bytes[1]<<2)) & 0xf; - digits[j*256+i+7] = ((bytes[0] >> 7) | (bytes[1]<<1)) & 0xf; + digits[j*64+i] = bytes[0] & 0xf; + digits[j*64+i+1] = bytes[0] >> 4; } #elif 1 + unsigned char bytes33[33]; + memcpy(bytes33, data[j].scalar.bytes, 32); + bytes33[32] = 0; + bytes = bytes33; for (size_t i = 0; i < 256; ++i) digits[j*256+i] = ((bytes[i>>3] | (bytes[(i>>3)+1]<<8)) >> (i&7)) & mask; #else @@ -521,7 +538,11 @@ skipfirst: if (skip[j]) continue; #endif +#if STRAUS_C==4 + const uint8_t digit = digits[j*64+i/4]; +#else const uint8_t digit = digits[j*256+i]; +#endif if (digit) { ge_add(&p1, &band_p3, &CACHE_OFFSET(local_cache, j, digit)); @@ -542,16 +563,13 @@ skipfirst: size_t get_pippenger_c(size_t N) { -// uncached: 2:1, 4:2, 8:2, 16:3, 32:4, 64:4, 128:5, 256:6, 512:7, 1024:7, 2048:8, 4096:9 -// cached: 2:1, 4:2, 8:2, 16:3, 32:4, 64:4, 128:5, 256:6, 512:7, 1024:7, 2048:8, 4096:9 - if (N <= 2) return 1; - if (N <= 8) return 2; - if (N <= 16) return 3; - if (N <= 64) return 4; - if (N <= 128) return 5; - if (N <= 256) return 6; - if (N <= 1024) return 7; - if (N <= 2048) return 8; + if (N <= 13) return 2; + if (N <= 29) return 3; + if (N <= 83) return 4; + if (N <= 185) return 5; + if (N <= 465) return 6; + if (N <= 1180) return 7; + if (N <= 2295) return 8; return 9; } @@ -563,12 +581,13 @@ struct pippenger_cached_data ~pippenger_cached_data() { aligned_free(cached); } }; -std::shared_ptr<pippenger_cached_data> pippenger_init_cache(const std::vector<MultiexpData> &data, size_t N) +std::shared_ptr<pippenger_cached_data> pippenger_init_cache(const std::vector<MultiexpData> &data, size_t start_offset, size_t N) { MULTIEXP_PERF(PERF_TIMER_START_UNIT(pippenger_init_cache, 1000000)); + CHECK_AND_ASSERT_THROW_MES(start_offset <= data.size(), "Bad cache base data"); if (N == 0) - N = data.size(); - CHECK_AND_ASSERT_THROW_MES(N <= data.size(), "Bad cache base data"); + N = data.size() - start_offset; + CHECK_AND_ASSERT_THROW_MES(N <= data.size() - start_offset, "Bad cache base data"); ge_cached cached; std::shared_ptr<pippenger_cached_data> cache(new pippenger_cached_data()); @@ -576,7 +595,7 @@ std::shared_ptr<pippenger_cached_data> pippenger_init_cache(const std::vector<Mu cache->cached = (ge_cached*)aligned_realloc(cache->cached, N * sizeof(ge_cached), 4096); CHECK_AND_ASSERT_THROW_MES(cache->cached, "Out of memory"); for (size_t i = 0; i < N; ++i) - ge_p3_to_cached(&cache->cached[i], &data[i].point); + ge_p3_to_cached(&cache->cached[i], &data[i+start_offset].point); MULTIEXP_PERF(PERF_TIMER_STOP(pippenger_init_cache)); return cache; @@ -587,16 +606,21 @@ size_t pippenger_get_cache_size(const std::shared_ptr<pippenger_cached_data> &ca return cache->size * sizeof(*cache->cached); } -rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<pippenger_cached_data> &cache, size_t c) +rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr<pippenger_cached_data> &cache, size_t cache_size, size_t c) { - CHECK_AND_ASSERT_THROW_MES(cache == NULL || cache->size >= data.size(), "Cache is too small"); + if (cache != NULL && cache_size == 0) + cache_size = cache->size; + CHECK_AND_ASSERT_THROW_MES(cache == NULL || cache_size <= cache->size, "Cache is too small"); if (c == 0) c = get_pippenger_c(data.size()); CHECK_AND_ASSERT_THROW_MES(c <= 9, "c is too large"); ge_p3 result = ge_p3_identity; + bool result_init = false; std::unique_ptr<ge_p3[]> buckets{new ge_p3[1<<c]}; + bool buckets_init[1<<9]; std::shared_ptr<pippenger_cached_data> local_cache = cache == NULL ? pippenger_init_cache(data) : cache; + std::shared_ptr<pippenger_cached_data> local_cache_2 = data.size() > cache_size ? pippenger_init_cache(data, cache_size) : NULL; rct::key maxscalar = rct::zero(); for (size_t i = 0; i < data.size(); ++i) @@ -611,7 +635,7 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr< for (size_t k = groups; k-- > 0; ) { - if (!ge_p3_is_point_at_infinity(&result)) + if (result_init) { ge_p2 p2; ge_p3_to_p2(&p2, &result); @@ -625,8 +649,7 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr< ge_p1p1_to_p2(&p2, &p1); } } - for (size_t i = 0; i < (1u<<c); ++i) - buckets[i] = ge_p3_identity; + memset(buckets_init, 0, 1u<<c); // partition scalars into buckets for (size_t i = 0; i < data.size(); ++i) @@ -638,22 +661,45 @@ rct::key pippenger(const std::vector<MultiexpData> &data, const std::shared_ptr< if (bucket == 0) continue; CHECK_AND_ASSERT_THROW_MES(bucket < (1u<<c), "bucket overflow"); - if (!ge_p3_is_point_at_infinity(&buckets[bucket])) + if (buckets_init[bucket]) { - add(buckets[bucket], local_cache->cached[i]); + if (i < cache_size) + add(buckets[bucket], local_cache->cached[i]); + else + add(buckets[bucket], local_cache_2->cached[i - cache_size]); } else + { buckets[bucket] = data[i].point; + buckets_init[bucket] = true; + } } // sum the buckets - ge_p3 pail = ge_p3_identity; + ge_p3 pail; + bool pail_init = false; for (size_t i = (1<<c)-1; i > 0; --i) { - if (!ge_p3_is_point_at_infinity(&buckets[i])) - add(pail, buckets[i]); - if (!ge_p3_is_point_at_infinity(&pail)) - add(result, pail); + if (buckets_init[i]) + { + if (pail_init) + add(pail, buckets[i]); + else + { + pail = buckets[i]; + pail_init = true; + } + } + if (pail_init) + { + if (result_init) + add(result, pail); + else + { + result = pail; + result_init = true; + } + } } } |