From f92903cc5338daed898242f22015d8426c065770 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Mon, 13 May 2019 12:33:11 +0300 Subject: [PATCH] Iteration counts should be `uint64_t` globally. (#817) This is a shameless rip-off of https://github.com/google/benchmark/pull/646 I did promise to look into why that proposed PR was producing so much worse assembly, and so i finally did. The reason is - that diff changes `size_t` (unsigned) to `int64_t` (signed). There is this nice little `assert`: https://github.com/google/benchmark/blob/7a1c37028359ca9d386d719a6ad527743cf1b753/include/benchmark/benchmark.h#L744 It ensures that we didn't magically decide to advance our iterator when we should have finished benchmarking. When `cached_` was unsigned, the `assert` was `cached_ UGT 0`. But we only ever get to that `assert` if `cached_ NE 0`, and naturally if `cached_` is not `0`, then it is bigger than `0`, so the `assert` is tautological, and gets folded away. But now that `cached_` became signed, the assert became `cached_ SGT 0`. And we still only know that `cached_ NE 0`, so the assert can't be optimized out, or at least it doesn't currently. Regardless of whether or not that is a bug in itself, that particular diff would have regressed the normal 64-bit systems, by halving the maximal iteration space (since we go from unsigned counter to signed one, of the same bit-width), which seems like a bug. And just so it happens, fixing *this* bug, fixes the other bug. This produces fully (bit-by-bit) identical state_assembly_test.s The filecheck change is actually needed regardless of this patch, else this test does not pass for me even without this diff. --- include/benchmark/benchmark.h | 38 +++++++++++++++++------------------ src/benchmark.cc | 4 ++-- src/benchmark_api_internal.cc | 6 +++--- src/benchmark_api_internal.h | 4 ++-- src/benchmark_register.cc | 2 +- src/benchmark_runner.cc | 23 +++++++++++---------- src/complexity.cc | 15 ++++++++------ src/counter.cc | 5 +++-- src/counter.h | 3 ++- src/json_reporter.cc | 6 ++++++ src/thread_manager.h | 2 +- test/basic_test.cc | 8 ++++---- test/complexity_test.cc | 12 ++++++----- test/cxx03_test.cc | 2 +- test/state_assembly_test.cc | 2 +- 15 files changed, 73 insertions(+), 59 deletions(-) diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 83293685..6cb96f54 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -56,8 +56,7 @@ static void BM_memcpy(benchmark::State& state) { memset(src, 'x', state.range(0)); for (auto _ : state) memcpy(dst, src, state.range(0)); - state.SetBytesProcessed(int64_t(state.iterations()) * - int64_t(state.range(0))); + state.SetBytesProcessed(state.iterations() * state.range(0)); delete[] src; delete[] dst; } BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10); @@ -122,8 +121,7 @@ template int BM_Sequential(benchmark::State& state) { q.Wait(&v); } // actually messages, not bytes: - state.SetBytesProcessed( - static_cast(state.iterations())*state.range(0)); + state.SetBytesProcessed(state.iterations() * state.range(0)); } BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue)->Range(1<<0, 1<<10); @@ -413,9 +411,11 @@ enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond }; // calculated automatically to the best fit. enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda }; +typedef uint64_t IterationCount; + // BigOFunc is passed to a benchmark in order to specify the asymptotic // computational complexity for the benchmark. -typedef double(BigOFunc)(int64_t); +typedef double(BigOFunc)(IterationCount); // StatisticsFunc is passed to a benchmark in order to compute some descriptive // statistics over all the measurements of some type @@ -488,7 +488,7 @@ class State { // while (state.KeepRunningBatch(1000)) { // // process 1000 elements // } - bool KeepRunningBatch(size_t n); + bool KeepRunningBatch(IterationCount n); // REQUIRES: timer is running and 'SkipWithError(...)' has not been called // by the current thread. @@ -627,7 +627,7 @@ class State { int64_t range_y() const { return range(1); } BENCHMARK_ALWAYS_INLINE - size_t iterations() const { + IterationCount iterations() const { if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) { return 0; } @@ -638,15 +638,15 @@ class State { : // items we expect on the first cache line (ie 64 bytes of the struct) // When total_iterations_ is 0, KeepRunning() and friends will return false. // May be larger than max_iterations. - size_t total_iterations_; + IterationCount total_iterations_; // When using KeepRunningBatch(), batch_leftover_ holds the number of // iterations beyond max_iters that were run. Used to track // completed_iterations_ accurately. - size_t batch_leftover_; + IterationCount batch_leftover_; public: - const size_t max_iterations; + const IterationCount max_iterations; private: bool started_; @@ -667,14 +667,14 @@ class State { const int threads; private: - State(size_t max_iters, const std::vector& ranges, int thread_i, - int n_threads, internal::ThreadTimer* timer, + State(IterationCount max_iters, const std::vector& ranges, + int thread_i, int n_threads, internal::ThreadTimer* timer, internal::ThreadManager* manager); void StartKeepRunning(); // Implementation of KeepRunning() and KeepRunningBatch(). // is_batch must be true unless n is 1. - bool KeepRunningInternal(size_t n, bool is_batch); + bool KeepRunningInternal(IterationCount n, bool is_batch); void FinishKeepRunning(); internal::ThreadTimer* timer_; internal::ThreadManager* manager_; @@ -686,11 +686,11 @@ inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { return KeepRunningInternal(1, /*is_batch=*/false); } -inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(size_t n) { +inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(IterationCount n) { return KeepRunningInternal(n, /*is_batch=*/true); } -inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(size_t n, +inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n, bool is_batch) { // total_iterations_ is set to 0 by the constructor, and always set to a // nonzero value by StartKepRunning(). @@ -754,7 +754,7 @@ struct State::StateIterator { } private: - size_t cached_; + IterationCount cached_; State* const parent_; }; @@ -858,7 +858,7 @@ class Benchmark { // NOTE: This function should only be used when *exact* iteration control is // needed and never to control or limit how long a benchmark runs, where // `--benchmark_min_time=N` or `MinTime(...)` should be used instead. - Benchmark* Iterations(size_t n); + Benchmark* Iterations(IterationCount n); // Specify the amount of times to repeat this benchmark. This option overrides // the `benchmark_repetitions` flag. @@ -957,7 +957,7 @@ class Benchmark { TimeUnit time_unit_; int range_multiplier_; double min_time_; - size_t iterations_; + IterationCount iterations_; int repetitions_; bool measure_process_cpu_time_; bool use_real_time_; @@ -1375,7 +1375,7 @@ class BenchmarkReporter { bool error_occurred; std::string error_message; - int64_t iterations; + IterationCount iterations; int64_t threads; int64_t repetition_index; int64_t repetitions; diff --git a/src/benchmark.cc b/src/benchmark.cc index c76346c6..29bfa351 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -121,8 +121,8 @@ void UseCharPointer(char const volatile*) {} } // namespace internal -State::State(size_t max_iters, const std::vector& ranges, int thread_i, - int n_threads, internal::ThreadTimer* timer, +State::State(IterationCount max_iters, const std::vector& ranges, + int thread_i, int n_threads, internal::ThreadTimer* timer, internal::ThreadManager* manager) : total_iterations_(0), batch_leftover_(0), diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index 8d310836..d468a257 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -3,9 +3,9 @@ namespace benchmark { namespace internal { -State BenchmarkInstance::Run( - size_t iters, int thread_id, internal::ThreadTimer* timer, - internal::ThreadManager* manager) const { +State BenchmarkInstance::Run(IterationCount iters, int thread_id, + internal::ThreadTimer* timer, + internal::ThreadManager* manager) const { State st(iters, arg, thread_id, threads, timer, manager); benchmark->Run(st); return st; diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 19c6d4f7..264eff95 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -32,10 +32,10 @@ struct BenchmarkInstance { bool last_benchmark_instance; int repetitions; double min_time; - size_t iterations; + IterationCount iterations; int threads; // Number of concurrent threads to us - State Run(size_t iters, int thread_id, internal::ThreadTimer* timer, + State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, internal::ThreadManager* manager) const; }; diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc index 3ffd7345..6696c382 100644 --- a/src/benchmark_register.cc +++ b/src/benchmark_register.cc @@ -376,7 +376,7 @@ Benchmark* Benchmark::MinTime(double t) { return this; } -Benchmark* Benchmark::Iterations(size_t n) { +Benchmark* Benchmark::Iterations(IterationCount n) { CHECK(n > 0); CHECK(IsZero(min_time_)); iterations_ = n; diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc index b1c0b889..0bae6a54 100644 --- a/src/benchmark_runner.cc +++ b/src/benchmark_runner.cc @@ -59,11 +59,12 @@ MemoryManager* memory_manager = nullptr; namespace { -static const size_t kMaxIterations = 1000000000; +static constexpr IterationCount kMaxIterations = 1000000000; BenchmarkReporter::Run CreateRunReport( const benchmark::internal::BenchmarkInstance& b, - const internal::ThreadManager::Result& results, size_t memory_iterations, + const internal::ThreadManager::Result& results, + IterationCount memory_iterations, const MemoryManager::Result& memory_result, double seconds, int64_t repetition_index) { // Create report about this benchmark run. @@ -109,8 +110,8 @@ BenchmarkReporter::Run CreateRunReport( // Execute one thread of benchmark b for the specified number of iterations. // Adds the stats collected for the thread into *total. -void RunInThread(const BenchmarkInstance* b, size_t iters, int thread_id, - ThreadManager* manager) { +void RunInThread(const BenchmarkInstance* b, IterationCount iters, + int thread_id, ThreadManager* manager) { internal::ThreadTimer timer( b->measure_process_cpu_time ? internal::ThreadTimer::CreateProcessCpuTime() @@ -187,13 +188,13 @@ class BenchmarkRunner { std::vector pool; - size_t iters; // preserved between repetitions! + IterationCount iters; // preserved between repetitions! // So only the first repetition has to find/calculate it, // the other repetitions will just use that precomputed iteration count. struct IterationResults { internal::ThreadManager::Result results; - size_t iters; + IterationCount iters; double seconds; }; IterationResults DoNIterations() { @@ -248,7 +249,7 @@ class BenchmarkRunner { return i; } - size_t PredictNumItersNeeded(const IterationResults& i) const { + IterationCount PredictNumItersNeeded(const IterationResults& i) const { // See how much iterations should be increased by. // Note: Avoid division by zero with max(seconds, 1ns). double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9); @@ -262,10 +263,10 @@ class BenchmarkRunner { if (multiplier <= 1.0) multiplier = 2.0; // So what seems to be the sufficiently-large iteration count? Round up. - const size_t max_next_iters = + const IterationCount max_next_iters = 0.5 + std::max(multiplier * i.iters, i.iters + 1.0); // But we do have *some* sanity limits though.. - const size_t next_iters = std::min(max_next_iters, kMaxIterations); + const IterationCount next_iters = std::min(max_next_iters, kMaxIterations); VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; return next_iters; // round up before conversion to integer. @@ -319,11 +320,11 @@ class BenchmarkRunner { // Oh, one last thing, we need to also produce the 'memory measurements'.. MemoryManager::Result memory_result; - size_t memory_iterations = 0; + IterationCount memory_iterations = 0; if (memory_manager != nullptr) { // Only run a few iterations to reduce the impact of one-time // allocations in benchmarks that are not properly managed. - memory_iterations = std::min(16, iters); + memory_iterations = std::min(16, iters); memory_manager->Start(); std::unique_ptr manager; manager.reset(new internal::ThreadManager(1)); diff --git a/src/complexity.cc b/src/complexity.cc index e65bd2eb..aeed67f0 100644 --- a/src/complexity.cc +++ b/src/complexity.cc @@ -29,20 +29,23 @@ BigOFunc* FittingCurve(BigO complexity) { static const double kLog2E = 1.44269504088896340736; switch (complexity) { case oN: - return [](int64_t n) -> double { return static_cast(n); }; + return [](IterationCount n) -> double { return static_cast(n); }; case oNSquared: - return [](int64_t n) -> double { return std::pow(n, 2); }; + return [](IterationCount n) -> double { return std::pow(n, 2); }; case oNCubed: - return [](int64_t n) -> double { return std::pow(n, 3); }; + return [](IterationCount n) -> double { return std::pow(n, 3); }; case oLogN: /* Note: can't use log2 because Android's GNU STL lacks it */ - return [](int64_t n) { return kLog2E * log(static_cast(n)); }; + return + [](IterationCount n) { return kLog2E * log(static_cast(n)); }; case oNLogN: /* Note: can't use log2 because Android's GNU STL lacks it */ - return [](int64_t n) { return kLog2E * n * log(static_cast(n)); }; + return [](IterationCount n) { + return kLog2E * n * log(static_cast(n)); + }; case o1: default: - return [](int64_t) { return 1.0; }; + return [](IterationCount) { return 1.0; }; } } diff --git a/src/counter.cc b/src/counter.cc index cb604e06..c248ea11 100644 --- a/src/counter.cc +++ b/src/counter.cc @@ -17,7 +17,7 @@ namespace benchmark { namespace internal { -double Finish(Counter const& c, int64_t iterations, double cpu_time, +double Finish(Counter const& c, IterationCount iterations, double cpu_time, double num_threads) { double v = c.value; if (c.flags & Counter::kIsRate) { @@ -35,7 +35,8 @@ double Finish(Counter const& c, int64_t iterations, double cpu_time, return v; } -void Finish(UserCounters* l, int64_t iterations, double cpu_time, double num_threads) { +void Finish(UserCounters* l, IterationCount iterations, double cpu_time, + double num_threads) { for (auto& c : *l) { c.second.value = Finish(c.second, iterations, cpu_time, num_threads); } diff --git a/src/counter.h b/src/counter.h index d884e50a..1ad46d49 100644 --- a/src/counter.h +++ b/src/counter.h @@ -18,7 +18,8 @@ namespace benchmark { // these counter-related functions are hidden to reduce API surface. namespace internal { -void Finish(UserCounters* l, int64_t iterations, double time, double num_threads); +void Finish(UserCounters* l, IterationCount iterations, double time, + double num_threads); void Increment(UserCounters* l, UserCounters const& r); bool SameNames(UserCounters const& l, UserCounters const& r); } // end namespace internal diff --git a/src/json_reporter.cc b/src/json_reporter.cc index cf1de254..11db2b99 100644 --- a/src/json_reporter.cc +++ b/src/json_reporter.cc @@ -68,6 +68,12 @@ std::string FormatKV(std::string const& key, int64_t value) { return ss.str(); } +std::string FormatKV(std::string const& key, IterationCount value) { + std::stringstream ss; + ss << '"' << StrEscape(key) << "\": " << value; + return ss.str(); +} + std::string FormatKV(std::string const& key, double value) { std::stringstream ss; ss << '"' << StrEscape(key) << "\": "; diff --git a/src/thread_manager.h b/src/thread_manager.h index 6e274c7e..1720281f 100644 --- a/src/thread_manager.h +++ b/src/thread_manager.h @@ -38,7 +38,7 @@ class ThreadManager { public: struct Result { - int64_t iterations = 0; + IterationCount iterations = 0; double real_time_used = 0; double cpu_time_used = 0; double manual_time_used = 0; diff --git a/test/basic_test.cc b/test/basic_test.cc index d07fbc00..5f3dd1a3 100644 --- a/test/basic_test.cc +++ b/test/basic_test.cc @@ -98,7 +98,7 @@ BENCHMARK(BM_empty_stop_start)->ThreadPerCpu(); void BM_KeepRunning(benchmark::State& state) { - size_t iter_count = 0; + benchmark::IterationCount iter_count = 0; assert(iter_count == state.iterations()); while (state.KeepRunning()) { ++iter_count; @@ -109,8 +109,8 @@ BENCHMARK(BM_KeepRunning); void BM_KeepRunningBatch(benchmark::State& state) { // Choose a prime batch size to avoid evenly dividing max_iterations. - const size_t batch_size = 101; - size_t iter_count = 0; + const benchmark::IterationCount batch_size = 101; + benchmark::IterationCount iter_count = 0; while (state.KeepRunningBatch(batch_size)) { iter_count += batch_size; } @@ -119,7 +119,7 @@ void BM_KeepRunningBatch(benchmark::State& state) { BENCHMARK(BM_KeepRunningBatch); void BM_RangedFor(benchmark::State& state) { - size_t iter_count = 0; + benchmark::IterationCount iter_count = 0; for (auto _ : state) { ++iter_count; } diff --git a/test/complexity_test.cc b/test/complexity_test.cc index 4a628692..d4febbbc 100644 --- a/test/complexity_test.cc +++ b/test/complexity_test.cc @@ -66,9 +66,9 @@ void BM_Complexity_O1(benchmark::State& state) { } BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1); BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(); -BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int64_t) { - return 1.0; -}); +BENCHMARK(BM_Complexity_O1) + ->Range(1, 1 << 18) + ->Complexity([](benchmark::IterationCount) { return 1.0; }); const char *one_test_name = "BM_Complexity_O1"; const char *big_o_1_test_name = "BM_Complexity_O1_BigO"; @@ -121,7 +121,9 @@ BENCHMARK(BM_Complexity_O_N) BENCHMARK(BM_Complexity_O_N) ->RangeMultiplier(2) ->Range(1 << 10, 1 << 16) - ->Complexity([](int64_t n) -> double { return static_cast(n); }); + ->Complexity([](benchmark::IterationCount n) -> double { + return static_cast(n); + }); BENCHMARK(BM_Complexity_O_N) ->RangeMultiplier(2) ->Range(1 << 10, 1 << 16) @@ -160,7 +162,7 @@ BENCHMARK(BM_Complexity_O_N_log_N) BENCHMARK(BM_Complexity_O_N_log_N) ->RangeMultiplier(2) ->Range(1 << 10, 1 << 16) - ->Complexity([](int64_t n) { + ->Complexity([](benchmark::IterationCount n) { return kLog2E * n * log(static_cast(n)); }); BENCHMARK(BM_Complexity_O_N_log_N) diff --git a/test/cxx03_test.cc b/test/cxx03_test.cc index baa9ed92..c4c9a522 100644 --- a/test/cxx03_test.cc +++ b/test/cxx03_test.cc @@ -14,7 +14,7 @@ void BM_empty(benchmark::State& state) { while (state.KeepRunning()) { - volatile std::size_t x = state.iterations(); + volatile benchmark::IterationCount x = state.iterations(); ((void)x); } } diff --git a/test/state_assembly_test.cc b/test/state_assembly_test.cc index abe9a4dd..7ddbb3b2 100644 --- a/test/state_assembly_test.cc +++ b/test/state_assembly_test.cc @@ -25,7 +25,7 @@ extern "C" int test_for_auto_loop() { for (auto _ : S) { // CHECK: .L[[LOOP_HEAD:[a-zA-Z0-9_]+]]: // CHECK-GNU-NEXT: subq $1, %rbx - // CHECK-CLANG-NEXT: {{(addq \$1,|incq)}} %rax + // CHECK-CLANG-NEXT: {{(addq \$1, %rax|incq %rax|addq \$-1, %rbx)}} // CHECK-NEXT: jne .L[[LOOP_HEAD]] benchmark::DoNotOptimize(x); }