mirror of https://github.com/google/benchmark.git
Inspired by the original implementation by Hai Huang @haih-g from https://github.com/google/benchmark/pull/1105.

The original implementation had design deficiencies that weren't really addressable without a redesign, so it was reverted. In essence, the original implementation consisted of two separable parts:

* reducing the amount of time each repetition is run for, and symmetrically increasing the repetition count
* running the repetitions in random order

While that worked fine for the usual case, it broke down when the user specified repetitions (the request was completely ignored), or specified a per-repetition min time (the repetition count was still adjusted, but the per-repetition time was not, leading to much longer run times).

Here, as I originally suggested in the original review, I'm separating the features and dealing with only a single one: running repetitions in random order.

Now that the runs/repetitions are no longer in order, the tooling may wish to sort the output, and indeed `compare.py` has been updated to do that: #1168.
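To make that single remaining feature concrete, here is a minimal standalone sketch of the scheduling idea (illustrative only, with hypothetical data; the actual implementation lives in `RunBenchmarks()` in the diff below): each benchmark keeps its own repetition count, and only the execution order of those repetitions is shuffled.

```cpp
// Sketch (not the library's internals): per-benchmark repetition counts stay
// untouched; only the order in which repetitions execute is shuffled.
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <random>
#include <vector>

int main() {
  // Suppose three benchmarks with 3, 5, and 2 requested repetitions.
  const std::vector<int> repeats = {3, 5, 2};

  // One entry per repetition, holding the index of the benchmark it belongs to.
  std::vector<size_t> repetition_indices;
  for (size_t b = 0; b < repeats.size(); ++b)
    std::fill_n(std::back_inserter(repetition_indices), repeats[b], b);

  // Random interleaving: shuffle only the execution order.
  std::random_device rd;
  std::mt19937 g(rd());
  std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);

  for (size_t benchmark_index : repetition_indices)
    std::printf("run one repetition of benchmark #%zu\n", benchmark_index);
  return 0;
}
```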
This commit is contained in: parent d17ea66551, commit fbc31405b2
@@ -299,6 +299,8 @@ too (`-lkstat`).

[Setting the Time Unit](#setting-the-time-unit)

[Random Interleaving](docs/random_interleaving.md)

[User-Requested Performance Counters](docs/perf_counters.md)

[Preventing Optimization](#preventing-optimization)
@@ -0,0 +1,13 @@
<a name="interleaving" />

# Random Interleaving

[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
technique to lower run-to-run variance. It randomly interleaves repetitions of a
microbenchmark with repetitions from other microbenchmarks in the same benchmark
test. Data shows it is able to lower run-to-run variance by
[40%](https://github.com/google/benchmark/issues/1051) on average.

To use it, you mainly need to set `--benchmark_enable_random_interleaving=true`,
and optionally specify a non-zero repetition count (`--benchmark_repetitions=9`)
and decrease the per-repetition time (`--benchmark_min_time=0.1`).
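As a concrete illustration (the benchmark names and the `mybench` binary are hypothetical, not part of the library), a small binary with two microbenchmarks could be run with these flags as shown below; with interleaving enabled, the repetitions of the two benchmarks are shuffled together instead of running back-to-back, so slow machine-wide drift affects both benchmarks more evenly.

```cpp
// Illustrative example: two microbenchmarks whose repetitions get interleaved.
// Build against the benchmark library as usual, then run e.g.:
//   ./mybench --benchmark_enable_random_interleaving=true \
//             --benchmark_repetitions=9 --benchmark_min_time=0.1
#include <numeric>
#include <vector>

#include "benchmark/benchmark.h"

static void BM_Fast(benchmark::State& state) {
  for (auto _ : state) {
    int x = 42;
    benchmark::DoNotOptimize(x);
  }
}
BENCHMARK(BM_Fast);

static void BM_Slow(benchmark::State& state) {
  std::vector<int> v(1 << 12, 1);
  for (auto _ : state) {
    benchmark::DoNotOptimize(std::accumulate(v.begin(), v.end(), 0));
  }
}
BENCHMARK(BM_Slow);

BENCHMARK_MAIN();
```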
@@ -1472,6 +1472,19 @@ class BenchmarkReporter {
int64_t max_bytes_used;
};

struct PerFamilyRunReports {
PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}

// How many runs will all instances of this benchmark perform?
int num_runs_total;

// How many runs have happened already?
int num_runs_done;

// The reports about (non-erroneous!) runs of this family.
std::vector<BenchmarkReporter::Run> Runs;
};

// Construct a BenchmarkReporter with the output stream set to 'std::cout'
// and the error stream set to 'std::cerr'
BenchmarkReporter();
@@ -33,8 +33,10 @@
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <thread>
#include <utility>
@@ -73,6 +75,10 @@ DEFINE_double(benchmark_min_time, 0.5);
// standard deviation of the runs will be reported.
DEFINE_int32(benchmark_repetitions, 1);

// If set, enable random interleaving of repetitions of all benchmarks.
// See http://github.com/google/benchmark/issues/1051 for details.
DEFINE_bool(benchmark_enable_random_interleaving, false);

// Report the result of each benchmark repetition. When 'true' is specified
// only the mean, standard deviation, and other statistics are reported for
// repeated benchmarks. Affects all reporters.
@@ -297,23 +303,68 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
context.name_field_width = name_field_width;

// Keep track of running times of all instances of each benchmark family.
std::map<int /*family_index*/, std::vector<BenchmarkReporter::Run>>
complexity_reports;
std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
per_family_reports;

if (display_reporter->ReportContext(context) &&
(!file_reporter || file_reporter->ReportContext(context))) {
FlushStreams(display_reporter);
FlushStreams(file_reporter);

for (const BenchmarkInstance& benchmark : benchmarks) {
std::vector<BenchmarkReporter::Run>* complexity_reports_for_family =
nullptr;
if (benchmark.complexity() != oNone)
complexity_reports_for_family =
&complexity_reports[benchmark.family_index()];
size_t num_repetitions_total = 0;

RunResults run_results =
RunBenchmark(benchmark, complexity_reports_for_family);
std::vector<internal::BenchmarkRunner> runners;
runners.reserve(benchmarks.size());
for (const BenchmarkInstance& benchmark : benchmarks) {
BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
if (benchmark.complexity() != oNone)
reports_for_family = &per_family_reports[benchmark.family_index()];

runners.emplace_back(benchmark, reports_for_family);
int num_repeats_of_this_instance = runners.back().GetNumRepeats();
num_repetitions_total += num_repeats_of_this_instance;
if (reports_for_family)
reports_for_family->num_runs_total += num_repeats_of_this_instance;
}
assert(runners.size() == benchmarks.size() && "Unexpected runner count.");

std::vector<int> repetition_indices;
repetition_indices.reserve(num_repetitions_total);
for (size_t runner_index = 0, num_runners = runners.size();
runner_index != num_runners; ++runner_index) {
const internal::BenchmarkRunner& runner = runners[runner_index];
std::fill_n(std::back_inserter(repetition_indices),
runner.GetNumRepeats(), runner_index);
}
assert(repetition_indices.size() == num_repetitions_total &&
"Unexpected number of repetition indexes.");

if (FLAGS_benchmark_enable_random_interleaving) {
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
}

for (size_t repetition_index : repetition_indices) {
internal::BenchmarkRunner& runner = runners[repetition_index];
runner.DoOneRepetition();
if (runner.HasRepeatsRemaining()) continue;
// FIXME: report each repetition separately, not all of them in bulk.

RunResults run_results = runner.GetResults();

// Maybe calculate complexity report
if (const auto* reports_for_family = runner.GetReportsForFamily()) {
if (reports_for_family->num_runs_done ==
reports_for_family->num_runs_total) {
auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
run_results.aggregates_only.insert(run_results.aggregates_only.end(),
additional_run_stats.begin(),
additional_run_stats.end());
per_family_reports.erase(
(int)reports_for_family->Runs.front().family_index);
}
}

Report(display_reporter, file_reporter, run_results);
}
@@ -471,6 +522,7 @@ void PrintUsageAndExit() {
" [--benchmark_filter=<regex>]\n"
" [--benchmark_min_time=<min_time>]\n"
" [--benchmark_repetitions=<num_repetitions>]\n"
" [--benchmark_enable_random_interleaving={true|false}]\n"
" [--benchmark_report_aggregates_only={true|false}]\n"
" [--benchmark_display_aggregates_only={true|false}]\n"
" [--benchmark_format=<console|json|csv>]\n"
@@ -495,6 +547,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
&FLAGS_benchmark_min_time) ||
ParseInt32Flag(argv[i], "benchmark_repetitions",
&FLAGS_benchmark_repetitions) ||
ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
&FLAGS_benchmark_enable_random_interleaving) ||
ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
&FLAGS_benchmark_report_aggregates_only) ||
ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
@@ -39,8 +39,6 @@ class BenchmarkInstance {
IterationCount iterations() const { return iterations_; }
int threads() const { return threads_; }

bool last_benchmark_instance;

State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
internal::ThreadManager* manager,
internal::PerfCountersMeasurement* perf_counters_measurement) const;
@@ -166,7 +166,6 @@ bool BenchmarkFamilies::FindBenchmarks(
const auto full_name = instance.name().str();
if ((re.Match(full_name) && !isNegativeFilter) ||
(!re.Match(full_name) && isNegativeFilter)) {
instance.last_benchmark_instance = (&args == &family->args_.back());
benchmarks->push_back(std::move(instance));

++per_family_instance_index;
@@ -143,9 +143,9 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,

BenchmarkRunner::BenchmarkRunner(
const benchmark::internal::BenchmarkInstance& b_,
std::vector<BenchmarkReporter::Run>* complexity_reports_)
BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
: b(b_),
complexity_reports(complexity_reports_),
reports_for_family(reports_for_family_),
min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
repeats(b.repetitions() != 0 ? b.repetitions()
: FLAGS_benchmark_repetitions),
@@ -172,22 +172,6 @@ BenchmarkRunner::BenchmarkRunner(
perf_counters_measurement.IsValid())
<< "Perf counters were requested but could not be set up.";
}

for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
DoOneRepetition(repetition_num);
}

// Calculate additional statistics
run_results.aggregates_only = ComputeStats(run_results.non_aggregates);

// Maybe calculate complexity report
if (complexity_reports && b.last_benchmark_instance) {
auto additional_run_stats = ComputeBigO(*complexity_reports);
run_results.aggregates_only.insert(run_results.aggregates_only.end(),
additional_run_stats.begin(),
additional_run_stats.end());
complexity_reports->clear();
}
}

BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
@@ -283,8 +267,10 @@ bool BenchmarkRunner::ShouldReportIterationResults(
((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
}

void BenchmarkRunner::DoOneRepetition(int64_t repetition_index) {
const bool is_the_first_repetition = repetition_index == 0;
void BenchmarkRunner::DoOneRepetition() {
assert(HasRepeatsRemaining() && "Already done all repetitions?");

const bool is_the_first_repetition = num_repetitions_done == 0;
IterationResults i;

// We *may* be gradually increasing the length (iteration count)
@@ -337,19 +323,25 @@ void BenchmarkRunner::DoOneRepetition(int64_t repetition_index) {
// Ok, now actually report.
BenchmarkReporter::Run report =
CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
repetition_index, repeats);
num_repetitions_done, repeats);

if (complexity_reports && !report.error_occurred)
complexity_reports->push_back(report);
if (reports_for_family) {
++reports_for_family->num_runs_done;
if (!report.error_occurred) reports_for_family->Runs.push_back(report);
}

run_results.non_aggregates.push_back(report);

++num_repetitions_done;
}

RunResults RunBenchmark(
const benchmark::internal::BenchmarkInstance& b,
std::vector<BenchmarkReporter::Run>* complexity_reports) {
internal::BenchmarkRunner r(b, complexity_reports);
return r.get_results();
RunResults&& BenchmarkRunner::GetResults() {
assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");

// Calculate additional statistics over the repetitions of this instance.
run_results.aggregates_only = ComputeStats(run_results.non_aggregates);

return std::move(run_results);
}

} // end namespace internal
@@ -50,20 +50,34 @@ struct RunResults {
class BenchmarkRunner {
public:
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
std::vector<BenchmarkReporter::Run>* complexity_reports_);
BenchmarkReporter::PerFamilyRunReports* reports_for_family);

RunResults&& get_results() { return std::move(run_results); }
int GetNumRepeats() const { return repeats; }

bool HasRepeatsRemaining() const {
return GetNumRepeats() != num_repetitions_done;
}

void DoOneRepetition();

RunResults&& GetResults();

BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
return reports_for_family;
};

private:
RunResults run_results;

const benchmark::internal::BenchmarkInstance& b;
std::vector<BenchmarkReporter::Run>* complexity_reports;
BenchmarkReporter::PerFamilyRunReports* reports_for_family;

const double min_time;
const int repeats;
const bool has_explicit_iteration_count;

int num_repetitions_done = 0;

std::vector<std::thread> pool;

IterationCount iters;  // preserved between repetitions!
@@ -83,14 +97,8 @@ class BenchmarkRunner {
IterationCount PredictNumItersNeeded(const IterationResults& i) const;

bool ShouldReportIterationResults(const IterationResults& i) const;

void DoOneRepetition(int64_t repetition_index);
};

RunResults RunBenchmark(
const benchmark::internal::BenchmarkInstance& b,
std::vector<BenchmarkReporter::Run>* complexity_reports);

} // namespace internal

} // end namespace benchmark
@@ -199,6 +199,7 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)

add_gtest(benchmark_gtest)
add_gtest(benchmark_name_gtest)
add_gtest(benchmark_random_interleaving_gtest)
add_gtest(commandlineflags_gtest)
add_gtest(statistics_gtest)
add_gtest(string_util_gtest)
@@ -0,0 +1,126 @@
#include <queue>
#include <string>
#include <vector>

#include "../src/commandlineflags.h"
#include "../src/string_util.h"
#include "benchmark/benchmark.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

DECLARE_bool(benchmark_enable_random_interleaving);
DECLARE_string(benchmark_filter);
DECLARE_int32(benchmark_repetitions);

namespace benchmark {
namespace internal {
namespace {

class EventQueue : public std::queue<std::string> {
public:
void Put(const std::string& event) { push(event); }

void Clear() {
while (!empty()) {
pop();
}
}

std::string Get() {
std::string event = front();
pop();
return event;
}
};

static EventQueue* queue = new EventQueue;

class NullReporter : public BenchmarkReporter {
public:
bool ReportContext(const Context& /*context*/) override { return true; }
void ReportRuns(const std::vector<Run>& /* report */) override {}
};

class BenchmarkTest : public testing::Test {
public:
static void SetupHook(int /* num_threads */) { queue->push("Setup"); }

static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }

void Execute(const std::string& pattern) {
queue->Clear();

BenchmarkReporter* reporter = new NullReporter;
FLAGS_benchmark_filter = pattern;
RunSpecifiedBenchmarks(reporter);
delete reporter;

queue->Put("DONE");  // End marker
}
};

static void BM_Match1(benchmark::State& state) {
const int64_t arg = state.range(0);

for (auto _ : state) {
}
queue->Put(StrFormat("BM_Match1/%d", static_cast<int>(arg)));
}
BENCHMARK(BM_Match1)
->Iterations(100)
->Arg(1)
->Arg(2)
->Arg(3)
->Range(10, 80)
->Args({90})
->Args({100});

TEST_F(BenchmarkTest, Match1) {
Execute("BM_Match1");
ASSERT_EQ("BM_Match1/1", queue->Get());
ASSERT_EQ("BM_Match1/2", queue->Get());
ASSERT_EQ("BM_Match1/3", queue->Get());
ASSERT_EQ("BM_Match1/10", queue->Get());
ASSERT_EQ("BM_Match1/64", queue->Get());
ASSERT_EQ("BM_Match1/80", queue->Get());
ASSERT_EQ("BM_Match1/90", queue->Get());
ASSERT_EQ("BM_Match1/100", queue->Get());
ASSERT_EQ("DONE", queue->Get());
}

TEST_F(BenchmarkTest, Match1WithRepetition) {
FLAGS_benchmark_repetitions = 2;

Execute("BM_Match1/(64|80)");
ASSERT_EQ("BM_Match1/64", queue->Get());
ASSERT_EQ("BM_Match1/64", queue->Get());
ASSERT_EQ("BM_Match1/80", queue->Get());
ASSERT_EQ("BM_Match1/80", queue->Get());
ASSERT_EQ("DONE", queue->Get());
}

TEST_F(BenchmarkTest, Match1WithRandomInterleaving) {
FLAGS_benchmark_enable_random_interleaving = true;
FLAGS_benchmark_repetitions = 100;

std::map<std::string, int> element_count;
std::map<std::string, int> interleaving_count;
Execute("BM_Match1/(64|80)");
for (int i = 0; i < 100; ++i) {
std::vector<std::string> interleaving;
interleaving.push_back(queue->Get());
interleaving.push_back(queue->Get());
element_count[interleaving[0].c_str()]++;
element_count[interleaving[1].c_str()]++;
interleaving_count[StrFormat("%s,%s", interleaving[0].c_str(),
interleaving[1].c_str())]++;
}
EXPECT_EQ(element_count["BM_Match1/64"], 100) << "Unexpected repetitions.";
EXPECT_EQ(element_count["BM_Match1/80"], 100) << "Unexpected repetitions.";
EXPECT_GE(interleaving_count.size(), 2) << "Interleaving was not randomized.";
ASSERT_EQ("DONE", queue->Get());
}

} // namespace
} // namespace internal
} // namespace benchmark