From fbc31405b2a2838560e155d7aec937e135ae0d81 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Thu, 3 Jun 2021 21:16:54 +0300
Subject: [PATCH] Random interleaving of benchmark repetitions - the sequel (fixes #1051) (#1163)

Inspired by the original implementation by Hai Huang @haih-g from
https://github.com/google/benchmark/pull/1105. The original implementation
had design deficiencies that weren't really addressable without a redesign,
so it was reverted.

In essence, the original implementation consisted of two separable parts:
* reducing the amount of time each repetition is run for, and symmetrically
  increasing the repetition count
* running the repetitions in random order

While that worked fine for the usual case, it broke down when the user
specified a repetition count (that request was completely ignored), or
specified a per-repetition min time (the repetition count was still adjusted,
but the per-repetition time was not, leading to much longer total run times).

Here, as I originally suggested in the original review, I'm separating the
features and dealing with only a single one: running the repetitions in
random order. Now that the runs/repetitions are no longer in order, the
tooling may wish to sort the output, and indeed `compare.py` has been
updated to do that: #1168.
---
 README.md                                   |   2 +
 docs/random_interleaving.md                 |  13 ++
 include/benchmark/benchmark.h               |  13 ++
 src/benchmark.cc                            |  74 ++++++++++--
 src/benchmark_api_internal.h                |   2 -
 src/benchmark_register.cc                   |   1 -
 src/benchmark_runner.cc                     |  48 ++++----
 src/benchmark_runner.h                      |  26 ++--
 test/CMakeLists.txt                         |   1 +
 test/benchmark_random_interleaving_gtest.cc | 126 ++++++++++++++++++++
 10 files changed, 256 insertions(+), 50 deletions(-)
 create mode 100644 docs/random_interleaving.md
 create mode 100644 test/benchmark_random_interleaving_gtest.cc

diff --git a/README.md b/README.md
index 79472294..aa61cef1 100644
--- a/README.md
+++ b/README.md
@@ -299,6 +299,8 @@ too (`-lkstat`).

 [Setting the Time Unit](#setting-the-time-unit)

+[Random Interleaving](docs/random_interleaving.md)
+
 [User-Requested Performance Counters](docs/perf_counters.md)

 [Preventing Optimization](#preventing-optimization)
diff --git a/docs/random_interleaving.md b/docs/random_interleaving.md
new file mode 100644
index 00000000..c0830368
--- /dev/null
+++ b/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of
+a microbenchmark with repetitions from other microbenchmarks in the same
+benchmark test. Data shows it lowers run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use it, set `--benchmark_enable_random_interleaving=true`. Optionally,
+specify a non-zero repetition count (`--benchmark_repetitions=9`), and
+optionally decrease the per-repetition time (`--benchmark_min_time=0.1`).
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 290c5c50..9b548024 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -1472,6 +1472,19 @@ class BenchmarkReporter {
     int64_t max_bytes_used;
   };

+  struct PerFamilyRunReports {
+    PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}
+
+    // How many runs will all instances of this benchmark perform?
+    int num_runs_total;
+
+    // How many runs have happened already?
+    int num_runs_done;
+
+    // The reports about (non-erroneous!) runs of this family.
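+    // (An erroneous run is still counted in num_runs_done, but its report
+    //  is not stored here.)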
+    std::vector<BenchmarkReporter::Run> Runs;
+  };
+
   // Construct a BenchmarkReporter with the output stream set to 'std::cout'
   // and the error stream set to 'std::cerr'
   BenchmarkReporter();
diff --git a/src/benchmark.cc b/src/benchmark.cc
index c4392bfd..89f64967 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -33,8 +33,10 @@
 #include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -73,6 +75,10 @@ DEFINE_double(benchmark_min_time, 0.5);
 // standard deviation of the runs will be reported.
 DEFINE_int32(benchmark_repetitions, 1);

+// If set, enable random interleaving of repetitions of all benchmarks.
+// See http://github.com/google/benchmark/issues/1051 for details.
+DEFINE_bool(benchmark_enable_random_interleaving, false);
+
 // Report the result of each benchmark repetitions. When 'true' is specified
 // only the mean, standard deviation, and other statistics are reported for
 // repeated benchmarks. Affects all reporters.
@@ -297,23 +303,68 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
   context.name_field_width = name_field_width;

   // Keep track of running times of all instances of each benchmark family.
-  std::map<int, std::vector<BenchmarkReporter::Run>>
-      complexity_reports;
+  std::map<int, BenchmarkReporter::PerFamilyRunReports>
+      per_family_reports;

   if (display_reporter->ReportContext(context) &&
       (!file_reporter || file_reporter->ReportContext(context))) {
     FlushStreams(display_reporter);
     FlushStreams(file_reporter);

-    for (const BenchmarkInstance& benchmark : benchmarks) {
-      std::vector<BenchmarkReporter::Run>* complexity_reports_for_family =
-          nullptr;
-      if (benchmark.complexity() != oNone)
-        complexity_reports_for_family =
-            &complexity_reports[benchmark.family_index()];
+    size_t num_repetitions_total = 0;

-      RunResults run_results =
-          RunBenchmark(benchmark, complexity_reports_for_family);
+    std::vector<internal::BenchmarkRunner> runners;
+    runners.reserve(benchmarks.size());
+    for (const BenchmarkInstance& benchmark : benchmarks) {
+      BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
+      if (benchmark.complexity() != oNone)
+        reports_for_family = &per_family_reports[benchmark.family_index()];
+
+      runners.emplace_back(benchmark, reports_for_family);
+      int num_repeats_of_this_instance = runners.back().GetNumRepeats();
+      num_repetitions_total += num_repeats_of_this_instance;
+      if (reports_for_family)
+        reports_for_family->num_runs_total += num_repeats_of_this_instance;
+    }
+    assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
+
+    std::vector<size_t> repetition_indices;
+    repetition_indices.reserve(num_repetitions_total);
+    for (size_t runner_index = 0, num_runners = runners.size();
+         runner_index != num_runners; ++runner_index) {
+      const internal::BenchmarkRunner& runner = runners[runner_index];
+      std::fill_n(std::back_inserter(repetition_indices),
+                  runner.GetNumRepeats(), runner_index);
+    }
+    assert(repetition_indices.size() == num_repetitions_total &&
+           "Unexpected number of repetition indexes.");
+
+    if (FLAGS_benchmark_enable_random_interleaving) {
+      std::random_device rd;
+      std::mt19937 g(rd());
+      std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
+    }
+
+    for (size_t repetition_index : repetition_indices) {
+      internal::BenchmarkRunner& runner = runners[repetition_index];
+      runner.DoOneRepetition();
+      if (runner.HasRepeatsRemaining()) continue;
+      // FIXME: report each repetition separately, not all of them in bulk.
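+      // At this point every repetition of this instance has run (possibly
+      // interleaved with repetitions of other instances), so its results can
+      // be aggregated and reported in one batch.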
+
+      RunResults run_results = runner.GetResults();
+
+      // Maybe calculate complexity report
+      if (const auto* reports_for_family = runner.GetReportsForFamily()) {
+        if (reports_for_family->num_runs_done ==
+            reports_for_family->num_runs_total) {
+          auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
+          run_results.aggregates_only.insert(run_results.aggregates_only.end(),
+                                             additional_run_stats.begin(),
+                                             additional_run_stats.end());
+          per_family_reports.erase(
+              (int)reports_for_family->Runs.front().family_index);
+        }
+      }

       Report(display_reporter, file_reporter, run_results);
     }
@@ -471,6 +522,7 @@ void PrintUsageAndExit() {
           " [--benchmark_filter=<regex>]\n"
           " [--benchmark_min_time=<min_time>]\n"
           " [--benchmark_repetitions=<num_repetitions>]\n"
+          " [--benchmark_enable_random_interleaving={true|false}]\n"
           " [--benchmark_report_aggregates_only={true|false}]\n"
           " [--benchmark_display_aggregates_only={true|false}]\n"
           " [--benchmark_format=<console|json|csv>]\n"
@@ -495,6 +547,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
                         &FLAGS_benchmark_min_time) ||
         ParseInt32Flag(argv[i], "benchmark_repetitions",
                        &FLAGS_benchmark_repetitions) ||
+        ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
+                      &FLAGS_benchmark_enable_random_interleaving) ||
         ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
                       &FLAGS_benchmark_report_aggregates_only) ||
         ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index e2afbd83..9296b7d2 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -39,8 +39,6 @@ class BenchmarkInstance {
   IterationCount iterations() const { return iterations_; }
   int threads() const { return threads_; }

-  bool last_benchmark_instance;
-
   State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
             internal::ThreadManager* manager,
             internal::PerfCountersMeasurement* perf_counters_measurement) const;
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index 435c48b4..57446222 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -166,7 +166,6 @@ bool BenchmarkFamilies::FindBenchmarks(
         const auto full_name = instance.name().str();
         if ((re.Match(full_name) && !isNegativeFilter) ||
             (!re.Match(full_name) && isNegativeFilter)) {
-          instance.last_benchmark_instance = (&args == &family->args_.back());
           benchmarks->push_back(std::move(instance));

           ++per_family_instance_index;
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index 55d6cf15..6742d42d 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -143,9 +143,9 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,

 BenchmarkRunner::BenchmarkRunner(
     const benchmark::internal::BenchmarkInstance& b_,
-    std::vector<BenchmarkReporter::Run>* complexity_reports_)
+    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
     : b(b_),
-      complexity_reports(complexity_reports_),
+      reports_for_family(reports_for_family_),
       min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
       repeats(b.repetitions() != 0 ?
                  b.repetitions() : FLAGS_benchmark_repetitions),
@@ -172,22 +172,6 @@ BenchmarkRunner::BenchmarkRunner(
           perf_counters_measurement.IsValid())
         << "Perf counters were requested but could not be set up.";
   }
-
-  for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
-    DoOneRepetition(repetition_num);
-  }
-
-  // Calculate additional statistics
-  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
-
-  // Maybe calculate complexity report
-  if (complexity_reports && b.last_benchmark_instance) {
-    auto additional_run_stats = ComputeBigO(*complexity_reports);
-    run_results.aggregates_only.insert(run_results.aggregates_only.end(),
-                                       additional_run_stats.begin(),
-                                       additional_run_stats.end());
-    complexity_reports->clear();
-  }
 }

 BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
@@ -283,8 +267,10 @@ bool BenchmarkRunner::ShouldReportIterationResults(
          ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
 }

-void BenchmarkRunner::DoOneRepetition(int64_t repetition_index) {
-  const bool is_the_first_repetition = repetition_index == 0;
+void BenchmarkRunner::DoOneRepetition() {
+  assert(HasRepeatsRemaining() && "Already done all repetitions?");
+
+  const bool is_the_first_repetition = num_repetitions_done == 0;
   IterationResults i;

   // We *may* be gradually increasing the length (iteration count)
@@ -337,19 +323,25 @@ void BenchmarkRunner::DoOneRepetition(int64_t repetition_index) {
   // Ok, now actually report.
   BenchmarkReporter::Run report =
       CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
-                      repetition_index, repeats);
+                      num_repetitions_done, repeats);

-  if (complexity_reports && !report.error_occurred)
-    complexity_reports->push_back(report);
+  if (reports_for_family) {
+    ++reports_for_family->num_runs_done;
+    if (!report.error_occurred) reports_for_family->Runs.push_back(report);
+  }

   run_results.non_aggregates.push_back(report);
+
+  ++num_repetitions_done;
 }

-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports) {
-  internal::BenchmarkRunner r(b, complexity_reports);
-  return r.get_results();
+RunResults&& BenchmarkRunner::GetResults() {
+  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
+
+  // Calculate additional statistics over the repetitions of this instance.
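+  // (By default these are the mean, median and standard deviation of the
+  //  per-repetition measurements.)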
+  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+
+  return std::move(run_results);
+}

 }  // end namespace internal
diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h
index 9730ad38..8a855236 100644
--- a/src/benchmark_runner.h
+++ b/src/benchmark_runner.h
@@ -50,20 +50,34 @@ struct RunResults {
 class BenchmarkRunner {
  public:
   BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
-                  std::vector<BenchmarkReporter::Run>* complexity_reports_);
+                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);

-  RunResults&& get_results() { return std::move(run_results); }
+  int GetNumRepeats() const { return repeats; }
+
+  bool HasRepeatsRemaining() const {
+    return GetNumRepeats() != num_repetitions_done;
+  }
+
+  void DoOneRepetition();
+
+  RunResults&& GetResults();
+
+  BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
+    return reports_for_family;
+  };

  private:
   RunResults run_results;

   const benchmark::internal::BenchmarkInstance& b;
-  std::vector<BenchmarkReporter::Run>* complexity_reports;
+  BenchmarkReporter::PerFamilyRunReports* reports_for_family;

   const double min_time;
   const int repeats;
   const bool has_explicit_iteration_count;

+  int num_repetitions_done = 0;
+
   std::vector<std::thread> pool;

   IterationCount iters;  // preserved between repetitions!
@@ -83,14 +97,8 @@ class BenchmarkRunner {
   IterationCount PredictNumItersNeeded(const IterationResults& i) const;

   bool ShouldReportIterationResults(const IterationResults& i) const;
-
-  void DoOneRepetition(int64_t repetition_index);
 };

-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports);
-
 }  // namespace internal
 }  // end namespace benchmark

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 012f5a8b..79cdf53b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -199,6 +199,7 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
   add_gtest(benchmark_gtest)
   add_gtest(benchmark_name_gtest)
+  add_gtest(benchmark_random_interleaving_gtest)
   add_gtest(commandlineflags_gtest)
   add_gtest(statistics_gtest)
   add_gtest(string_util_gtest)
diff --git a/test/benchmark_random_interleaving_gtest.cc b/test/benchmark_random_interleaving_gtest.cc
new file mode 100644
index 00000000..8e28dab3
--- /dev/null
+++ b/test/benchmark_random_interleaving_gtest.cc
@@ -0,0 +1,126 @@
+#include
+#include
+#include
+
+#include "../src/commandlineflags.h"
+#include "../src/string_util.h"
+#include "benchmark/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+DECLARE_bool(benchmark_enable_random_interleaving);
+DECLARE_string(benchmark_filter);
+DECLARE_int32(benchmark_repetitions);
+
+namespace benchmark {
+namespace internal {
+namespace {
+
+class EventQueue : public std::queue<std::string> {
+ public:
+  void Put(const std::string& event) { push(event); }
+
+  void Clear() {
+    while (!empty()) {
+      pop();
+    }
+  }
+
+  std::string Get() {
+    std::string event = front();
+    pop();
+    return event;
+  }
+};
+
+static EventQueue* queue = new EventQueue;
+
+class NullReporter : public BenchmarkReporter {
+ public:
+  bool ReportContext(const Context& /*context*/) override { return true; }
+  void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+class BenchmarkTest : public testing::Test {
+ public:
+  static void SetupHook(int /* num_threads */) { queue->push("Setup"); }
+
+  static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }
+
+  void Execute(const std::string& pattern) {
+    queue->Clear();
+
+    BenchmarkReporter* reporter = new NullReporter;
+    FLAGS_benchmark_filter = pattern;
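+    // Run the registered benchmarks whose names match 'pattern'; every
+    // repetition pushes its name onto 'queue', so the tests below can
+    // observe the execution order.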
+    RunSpecifiedBenchmarks(reporter);
+    delete reporter;
+
+    queue->Put("DONE");  // End marker
+  }
+};
+
+static void BM_Match1(benchmark::State& state) {
+  const int64_t arg = state.range(0);
+
+  for (auto _ : state) {
+  }
+  queue->Put(StrFormat("BM_Match1/%d", static_cast<int>(arg)));
+}
+BENCHMARK(BM_Match1)
+    ->Iterations(100)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(3)
+    ->Range(10, 80)
+    ->Args({90})
+    ->Args({100});
+
+TEST_F(BenchmarkTest, Match1) {
+  Execute("BM_Match1");
+  ASSERT_EQ("BM_Match1/1", queue->Get());
+  ASSERT_EQ("BM_Match1/2", queue->Get());
+  ASSERT_EQ("BM_Match1/3", queue->Get());
+  ASSERT_EQ("BM_Match1/10", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/90", queue->Get());
+  ASSERT_EQ("BM_Match1/100", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRepetition) {
+  FLAGS_benchmark_repetitions = 2;
+
+  Execute("BM_Match1/(64|80)");
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRandomInterleaving) {
+  FLAGS_benchmark_enable_random_interleaving = true;
+  FLAGS_benchmark_repetitions = 100;
+
+  std::map<std::string, int> element_count;
+  std::map<std::string, int> interleaving_count;
+  Execute("BM_Match1/(64|80)");
+  for (int i = 0; i < 100; ++i) {
+    std::vector<std::string> interleaving;
+    interleaving.push_back(queue->Get());
+    interleaving.push_back(queue->Get());
+    element_count[interleaving[0].c_str()]++;
+    element_count[interleaving[1].c_str()]++;
+    interleaving_count[StrFormat("%s,%s", interleaving[0].c_str(),
+                                 interleaving[1].c_str())]++;
+  }
+  EXPECT_EQ(element_count["BM_Match1/64"], 100) << "Unexpected repetitions.";
+  EXPECT_EQ(element_count["BM_Match1/80"], 100) << "Unexpected repetitions.";
+  EXPECT_GE(interleaving_count.size(), 2) << "Interleaving was not randomized.";
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark
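
Illustrative appendix (not part of the patch): a minimal, self-contained sketch of the scheduling idea that the new RunBenchmarks() loop implements. All names below are local to this sketch rather than library API; it only builds a per-repetition schedule and shuffles it, which is what --benchmark_enable_random_interleaving turns on.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <iterator>
#include <random>
#include <vector>

int main() {
  // Pretend there are three benchmark instances, each configured to run
  // three repetitions (the real code asks each BenchmarkRunner instead).
  const std::vector<int> num_repeats = {3, 3, 3};

  // Build one schedule entry per repetition; the value is the index of the
  // instance that the repetition belongs to.
  std::vector<std::size_t> repetition_indices;
  for (std::size_t instance = 0; instance != num_repeats.size(); ++instance) {
    std::fill_n(std::back_inserter(repetition_indices),
                num_repeats[instance], instance);
  }

  // Random interleaving simply shuffles that schedule; without the flag the
  // schedule stays in instance order, i.e. the old in-order behaviour.
  std::random_device rd;
  std::mt19937 g(rd());
  std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);

  // Each entry now means "run one repetition of instance N".
  for (std::size_t instance : repetition_indices) std::printf("%zu ", instance);
  std::printf("\n");
  return 0;
}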