Refactoring of PerfCounters infrastructure (#1559)

* Refactoring of PerfCounters infrastructure

The main feature in this pull request is the removal of the static
sharing of PerfCounters: counters are now created in the top-level
`RunBenchmarks()` function, where all benchmark runners are created. A
single PerfCountersMeasurement object is created and then shared with
all the new BenchmarkRunner objects, one per existing benchmark.
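
Roughly, the new lifetime model is "one owner, many borrowers". Here is a
minimal self-contained sketch with hypothetical stand-in types (not the
library's actual classes; the real code is in src/benchmark.cc below):

    #include <string>
    #include <utility>
    #include <vector>

    // Stand-ins for PerfCountersMeasurement / BenchmarkRunner, only to
    // illustrate the lifetime: one shared object, created up front and
    // borrowed by every runner through a non-owning pointer.
    struct Measurement {
      explicit Measurement(std::vector<std::string> names)
          : names(std::move(names)) {}
      std::vector<std::string> names;
    };
    struct Runner {
      explicit Runner(Measurement* shared) : pcm(shared) {}
      Measurement* pcm;  // non-owning; the Measurement outlives all Runners
    };

    int main() {
      Measurement shared({"CYCLES", "INSTRUCTIONS"});  // created once
      std::vector<Runner> runners;
      for (int i = 0; i < 3; ++i) runners.emplace_back(&shared);
    }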

Other changes bundled into this PR:
- Added a BENCHMARK_DONT_OPTIMIZE macro in global scope
- Removed the `IsValid()` query, replacing it with a check on the number
  of counters remaining after validation (see the sketch after this list)
- Refactored all GTests to reflect the changes and new semantics
- Added extra comments throughout the new code to clarify intent
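
For illustration, a call site that previously asked `IsValid()` now counts
the counters that survived validation. A sketch against the new API
(`HaveUsableCounters` is a hypothetical helper, not part of this PR):

    #include <string>
    #include <vector>
    #include "../src/perf_counters.h"  // internal header, as the gtests use

    // "Valid" now means "at least one counter survived validation".
    bool HaveUsableCounters(const std::vector<std::string>& names) {
      // Old: PerfCounters::Create(names).IsValid()
      return benchmark::internal::PerfCounters::Create(names).num_counters() > 0;
    }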

It was extremely hard to separate these features into different PRs as
requested because they are so interdependent, so I'm just pushing them
all together and asking for forgiveness.

This PR replaces PRs 1555 and 1558, which have been closed.

* Fixed whitespace issue with clang-format

My clang-format insists on deleting a single whitespace character on line
601, while GitHub's clang-format check fails when it is added. I had to
disable format-on-save to check in this revert change.
I'm using clang 14.0.6.
Henrique Bucher, 2023-03-07 04:27:52 -06:00 (committed by GitHub)
commit fbc6efa9b5, parent 9885aefb96
7 changed files with 261 additions and 179 deletions

include/benchmark/benchmark.h

@@ -218,6 +218,18 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#define BENCHMARK_UNUSED
#endif
// Used to annotate functions, methods and classes so they
// are not optimized by the compiler. Useful for tests
// where you expect loops to stay in place churning cycles
#if defined(__clang__)
#define BENCHMARK_DONT_OPTIMIZE __attribute__((optnone))
#elif defined(__GNUC__) || defined(__GNUG__)
#define BENCHMARK_DONT_OPTIMIZE __attribute__((optimize(0)))
#else
// MSVC & Intel do not have a no-optimize attribute, only line pragmas
#define BENCHMARK_DONT_OPTIMIZE
#endif
#if defined(__GNUC__) || defined(__clang__)
#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
#elif defined(_MSC_VER) && !defined(__clang__)
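
For context, a usage sketch of the new macro (SpinWork is a made-up name;
note the macro intentionally expands to nothing on MSVC/Intel, so tests
should not rely on it alone):

    #include <cstddef>
    #include "benchmark/benchmark.h"

    // Keep a cycle-churning test helper from being optimized away; pair
    // the attribute with DoNotOptimize as a belt-and-braces measure.
    BENCHMARK_DONT_OPTIMIZE size_t SpinWork() {
      size_t sum = 0;
      for (size_t i = 0; i < 1000000; ++i) sum += i;
      benchmark::DoNotOptimize(sum);
      return sum;
    }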

src/benchmark.cc

@@ -348,14 +348,26 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
size_t num_repetitions_total = 0;
// This perfcounters object needs to be created before the runners vector
// below so it outlasts their lifetime.
PerfCountersMeasurement perfcounters(
StrSplit(FLAGS_benchmark_perf_counters, ','));
// Vector of benchmark runners, one per benchmark
std::vector<internal::BenchmarkRunner> runners;
runners.reserve(benchmarks.size());
// Count the number of benchmarks with threads to warn the user in case
// performance counters are used.
int benchmarks_with_threads = 0;
// Loop through all benchmarks
for (const BenchmarkInstance& benchmark : benchmarks) {
BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
if (benchmark.complexity() != oNone)
reports_for_family = &per_family_reports[benchmark.family_index()];
runners.emplace_back(benchmark, reports_for_family);
benchmarks_with_threads += (benchmark.threads() > 0);
runners.emplace_back(benchmark, &perfcounters, reports_for_family);
int num_repeats_of_this_instance = runners.back().GetNumRepeats();
num_repetitions_total += num_repeats_of_this_instance;
if (reports_for_family)
@@ -363,6 +375,17 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
}
assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
// The use of performance counters with threads would be unintuitive for
// the average user so we need to warn them about this case
if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
GetErrorLogInstance()
<< "***WARNING*** There are " << benchmarks_with_threads
<< " benchmarks with threads and " << perfcounters.num_counters()
<< " performance counters were requested. Beware counters will "
"reflect the combined usage across all "
"threads.\n";
}
std::vector<size_t> repetition_indices;
repetition_indices.reserve(num_repetitions_total);
for (size_t runner_index = 0, num_runners = runners.size();

src/benchmark_runner.cc

@@ -221,6 +221,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value) {
BenchmarkRunner::BenchmarkRunner(
const benchmark::internal::BenchmarkInstance& b_,
PerfCountersMeasurement* pcm_,
BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
: b(b_),
reports_for_family(reports_for_family_),
@@ -239,10 +240,7 @@ BenchmarkRunner::BenchmarkRunner(
iters(has_explicit_iteration_count
? ComputeIters(b_, parsed_benchtime_flag)
: 1),
perf_counters_measurement(StrSplit(FLAGS_benchmark_perf_counters, ',')),
perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
? &perf_counters_measurement
: nullptr) {
perf_counters_measurement_ptr(pcm_) {
run_results.display_report_aggregates_only =
(FLAGS_benchmark_report_aggregates_only ||
FLAGS_benchmark_display_aggregates_only);
@@ -255,7 +253,7 @@ BenchmarkRunner::BenchmarkRunner(
run_results.file_report_aggregates_only =
(b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
perf_counters_measurement.IsValid())
(perf_counters_measurement_ptr->num_counters() == 0))
<< "Perf counters were requested but could not be set up.";
}
}

src/benchmark_runner.h

@@ -58,6 +58,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value);
class BenchmarkRunner {
public:
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
benchmark::internal::PerfCountersMeasurement* pmc_,
BenchmarkReporter::PerFamilyRunReports* reports_for_family);
int GetNumRepeats() const { return repeats; }
@@ -103,8 +104,7 @@ class BenchmarkRunner {
// So only the first repetition has to find/calculate it,
// the other repetitions will just use that precomputed iteration count.
PerfCountersMeasurement perf_counters_measurement;
PerfCountersMeasurement* const perf_counters_measurement_ptr;
PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
struct IterationResults {
internal::ThreadManager::Result results;

src/perf_counters.cc

@@ -71,80 +71,78 @@ bool PerfCounters::IsCounterSupported(const std::string& name) {
return (ret == PFM_SUCCESS);
}
// Validates all counter names passed, returning only the valid ones
static std::vector<std::string> validateCounters(
const std::vector<std::string>& counter_names) {
// All valid names to be returned
std::vector<std::string> valid_names;
// Loop through all the given names
int invalid_counter = 0;
for (const std::string& name : counter_names) {
// Check trivial empty
if (name.empty()) {
GetErrorLogInstance() << "A counter name was the empty string\n";
invalid_counter++;
continue;
}
if (PerfCounters::IsCounterSupported(name)) {
// we are about to push into the valid names vector
// check if we did not reach the maximum
if (valid_names.size() == PerfCounterValues::kMaxCounters) {
GetErrorLogInstance()
<< counter_names.size()
<< " counters were requested. The maximum is "
<< PerfCounterValues::kMaxCounters << " and "
<< counter_names.size() - invalid_counter - valid_names.size()
<< " will be ignored\n";
// stop the loop and return what we have already
break;
}
valid_names.push_back(name);
} else {
GetErrorLogInstance() << "Performance counter " << name
<< " incorrect or not supported on this platform\n";
invalid_counter++;
}
}
// RVO should take care of this
return valid_names;
}
PerfCounters PerfCounters::Create(
const std::vector<std::string>& counter_names) {
std::vector<std::string> valid_names = validateCounters(counter_names);
if (valid_names.empty()) {
return NoCounters();
}
std::vector<int> counter_ids(valid_names.size());
// Valid counters will populate these arrays but we start empty
std::vector<std::string> valid_names;
std::vector<int> counter_ids;
std::vector<int> leader_ids;
const int mode = PFM_PLM3; // user mode only
// Reserve the maximum possible size
valid_names.reserve(counter_names.size());
counter_ids.reserve(counter_names.size());
const int kCounterMode = PFM_PLM3; // user mode only
// Group leads will be assigned on demand. The idea is that once we cannot
// create a counter descriptor, the reason is that this group has maxed out
// so we set the group_id again to -1 and retry - giving the algorithm a
// chance to create a new group leader to hold the next set of counters.
int group_id = -1;
for (size_t i = 0; i < valid_names.size(); ++i) {
// Loop through all performance counters
for (size_t i = 0; i < counter_names.size(); ++i) {
// we are about to push into the valid names vector;
// check that we have not reached the maximum
if (valid_names.size() == PerfCounterValues::kMaxCounters) {
// Log a message if we maxed out and stop adding
GetErrorLogInstance()
<< counter_names.size() << " counters were requested. The maximum is "
<< PerfCounterValues::kMaxCounters << " and " << valid_names.size()
<< " were already added. All remaining counters will be ignored\n";
// stop the loop and return what we have already
break;
}
// Check if this name is empty
const auto& name = counter_names[i];
if (name.empty()) {
GetErrorLogInstance()
<< "A performance counter name was the empty string\n";
continue;
}
// Here first means first in group, i.e. the group leader
const bool is_first = (group_id < 0);
// This struct will be populated by libpfm from the counter string
// and then fed into the syscall perf_event_open
struct perf_event_attr attr {};
attr.size = sizeof(attr);
const auto& name = valid_names[i];
// This is the input struct to libpfm.
pfm_perf_encode_arg_t arg{};
arg.attr = &attr;
const int pfm_get =
pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
PFM_OS_PERF_EVENT, &arg);
if (pfm_get != PFM_SUCCESS) {
GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
return NoCounters();
GetErrorLogInstance()
<< "Unknown performance counter name: " << name << "\n";
continue;
}
attr.disabled = is_first;
// We then proceed to populate the remaining fields in our attribute struct
// Note: the man page for perf_event_open suggests inherit = true and
// read_format = PERF_FORMAT_GROUP don't work together, but that's not the
// case.
attr.disabled = is_first;
attr.inherit = true;
attr.pinned = is_first;
attr.exclude_kernel = true;
attr.exclude_user = false;
attr.exclude_hv = true;
// Read all counters in one read.
// Read all counters in a group in one read.
attr.read_format = PERF_FORMAT_GROUP;
int id = -1;
@@ -159,36 +157,64 @@ PerfCounters PerfCounters::Create(
}
}
if (id < 0) {
// We reached a limit perhaps?
// If the file descriptor is negative we might have reached a limit
// in the current group. Set the group_id to -1 and retry
if (group_id >= 0) {
// Create a new group
group_id = -1;
} else {
// Give up, there is nothing else to try
// At this point we have already retried to set a new group id and
// failed. We then give up.
break;
}
}
}
// We failed to get a new file descriptor. We might have reached a hard
// hardware limit that cannot be resolved even with group multiplexing
if (id < 0) {
GetErrorLogInstance()
<< "Failed to get a file descriptor for " << name << "\n";
return NoCounters();
GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
"for performance counter "
<< name << ". Ignoring\n";
// We give up on this counter but try to keep going
// as the others would be fine
continue;
}
if (group_id < 0) {
// This is a leader, store and assign it
// This is a leader, store and assign it to the current file descriptor
leader_ids.push_back(id);
group_id = id;
}
counter_ids[i] = id;
// This is a valid counter, add it to our descriptor's list
counter_ids.push_back(id);
valid_names.push_back(name);
}
// Loop through all group leaders, activating them
// There is another option of starting ALL counters in a process, but
// that would be far too intrusive. If the user is using PMCs
// by themselves then this would have a side effect on them. It is
// friendlier to loop through all groups individually.
for (int lead : leader_ids) {
if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
GetErrorLogInstance() << "Failed to start counters\n";
// This should never happen but if it does, we give up on the
// entire batch as recovery would be a mess.
GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
"Claring out all counters.\n";
// Close all peformance counters
for (int id : counter_ids) {
::close(id);
}
// Return an empty object so our internal state is still good and
// the process can continue normally without impact
return NoCounters();
}
}
return PerfCounters(valid_names, std::move(counter_ids),
return PerfCounters(std::move(valid_names), std::move(counter_ids),
std::move(leader_ids));
}
@@ -223,34 +249,10 @@ PerfCounters PerfCounters::Create(
void PerfCounters::CloseCounters() const {}
#endif // defined HAVE_LIBPFM
Mutex PerfCountersMeasurement::mutex_;
int PerfCountersMeasurement::ref_count_ = 0;
PerfCounters PerfCountersMeasurement::counters_ = PerfCounters::NoCounters();
// The validation in PerfCounters::Create may yield fewer counters than
// requested, so it is safe to initialize start_values_ and end_values_
// with the requested count as an upper bound
PerfCountersMeasurement::PerfCountersMeasurement(
const std::vector<std::string>& counter_names)
: start_values_(counter_names.size()), end_values_(counter_names.size()) {
MutexLock l(mutex_);
if (ref_count_ == 0) {
counters_ = PerfCounters::Create(counter_names);
}
// We chose to increment it even if `counters_` ends up invalid,
// so that we don't keep trying to create, and also since the dtor
// will decrement regardless of `counters_`'s validity
++ref_count_;
BM_CHECK(!counters_.IsValid() || counters_.names() == counter_names);
}
PerfCountersMeasurement::~PerfCountersMeasurement() {
MutexLock l(mutex_);
--ref_count_;
if (ref_count_ == 0) {
counters_ = PerfCounters::NoCounters();
}
counters_ = PerfCounters::Create(counter_names);
}
PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {

src/perf_counters.h

@@ -90,10 +90,11 @@ class BENCHMARK_EXPORT PerfCounters final {
// True iff this platform supports performance counters.
static const bool kSupported;
bool IsValid() const { return !counter_names_.empty(); }
// Returns an empty object
static PerfCounters NoCounters() { return PerfCounters(); }
~PerfCounters() { CloseCounters(); }
PerfCounters() = default;
PerfCounters(PerfCounters&&) = default;
PerfCounters(const PerfCounters&) = delete;
PerfCounters& operator=(PerfCounters&&) noexcept;
@@ -110,8 +111,8 @@ class BENCHMARK_EXPORT PerfCounters final {
// Return a PerfCounters object ready to read the counters with the names
// specified. The values are user-mode only. The counter name format is
// implementation and OS specific.
// TODO: once we move to C++-17, this should be a std::optional, and then the
// IsValid() boolean can be dropped.
// In case of failure, this method will in the worst case return an
// empty object whose state will still be valid.
static PerfCounters Create(const std::vector<std::string>& counter_names);
// Take a snapshot of the current value of the counters into the provided
@@ -120,7 +121,6 @@ class BENCHMARK_EXPORT PerfCounters final {
BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
#ifndef BENCHMARK_OS_WINDOWS
assert(values != nullptr);
assert(IsValid());
return values->Read(leader_ids_) == counter_ids_.size();
#else
(void)values;
@@ -137,7 +137,6 @@ class BENCHMARK_EXPORT PerfCounters final {
: counter_ids_(std::move(counter_ids)),
leader_ids_(std::move(leader_ids)),
counter_names_(counter_names) {}
PerfCounters() = default;
void CloseCounters() const;
@@ -150,33 +149,25 @@ class BENCHMARK_EXPORT PerfCounters final {
class BENCHMARK_EXPORT PerfCountersMeasurement final {
public:
PerfCountersMeasurement(const std::vector<std::string>& counter_names);
~PerfCountersMeasurement();
// The only way to get to `counters_` is after ctor-ing a
// `PerfCountersMeasurement`, which means that `counters_`'s state is, here,
// decided (either invalid or valid) and won't change again even if a ctor is
// concurrently running with this. This is preferring efficiency to
// maintainability, because the address of the static can be known at compile
// time.
bool IsValid() const {
MutexLock l(mutex_);
return counters_.IsValid();
}
size_t num_counters() const { return counters_.num_counters(); }
BENCHMARK_ALWAYS_INLINE void Start() {
assert(IsValid());
MutexLock l(mutex_);
std::vector<std::string> names() const { return counters_.names(); }
BENCHMARK_ALWAYS_INLINE bool Start() {
if (num_counters() == 0) return true;
// Tell the compiler to not move instructions above/below where we take
// the snapshot.
ClobberMemory();
valid_read_ &= counters_.Snapshot(&start_values_);
ClobberMemory();
return valid_read_;
}
BENCHMARK_ALWAYS_INLINE bool Stop(
std::vector<std::pair<std::string, double>>& measurements) {
assert(IsValid());
MutexLock l(mutex_);
if (num_counters() == 0) return true;
// Tell the compiler to not move instructions above/below where we take
// the snapshot.
ClobberMemory();
@@ -193,9 +184,7 @@ class BENCHMARK_EXPORT PerfCountersMeasurement final {
}
private:
static Mutex mutex_;
GUARDED_BY(mutex_) static int ref_count_;
GUARDED_BY(mutex_) static PerfCounters counters_;
PerfCounters counters_;
bool valid_read_ = true;
PerfCounterValues start_values_;
PerfCounterValues end_values_;
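
As a quick usage sketch of the new Start()/Stop() contract (both return
false on a failed read instead of asserting on IsValid(); the counter
name below is platform specific and just an example):

    #include <string>
    #include <utility>
    #include <vector>
    #include "../src/perf_counters.h"  // internal header, as the gtests use

    int main() {
      benchmark::internal::PerfCounters::Initialize();  // as the tests do
      benchmark::internal::PerfCountersMeasurement pcm({"CYCLES"});
      if (pcm.num_counters() == 0) return 0;  // nothing survived validation
      std::vector<std::pair<std::string, double>> measurements;
      if (pcm.Start() && pcm.Stop(measurements)) {
        // measurements now holds {counter name, delta} pairs
      }
      return 0;
    }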

test/perf_counters_gtest.cc

@@ -1,3 +1,4 @@
#include <random>
#include <thread>
#include "../src/perf_counters.h"
@@ -28,7 +29,7 @@ TEST(PerfCountersTest, OneCounter) {
GTEST_SKIP() << "Performance counters not supported.\n";
}
EXPECT_TRUE(PerfCounters::Initialize());
EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1}).IsValid());
EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
}
TEST(PerfCountersTest, NegativeTest) {
@@ -37,38 +38,42 @@ TEST(PerfCountersTest, NegativeTest) {
return;
}
EXPECT_TRUE(PerfCounters::Initialize());
EXPECT_FALSE(PerfCounters::Create({}).IsValid());
EXPECT_FALSE(PerfCounters::Create({""}).IsValid());
EXPECT_FALSE(PerfCounters::Create({"not a counter name"}).IsValid());
EXPECT_TRUE(PerfCounters::Create(
{kGenericPerfEvent1, kGenericPerfEvent2, kGenericPerfEvent3})
.IsValid());
// Sanity checks
// Create() will always create a valid object, even if passed no or
// wrong arguments as the new behavior is to warn and drop unsupported
// counters
EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
{
// Try sneaking in a bad egg to see if it is filtered out. The
// number of counters has to be two, not zero
auto counter =
PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
EXPECT_TRUE(counter.IsValid());
EXPECT_EQ(counter.num_counters(), 2);
EXPECT_EQ(counter.names(), std::vector<std::string>(
{kGenericPerfEvent2, kGenericPerfEvent1}));
}
{
// Try sneaking in an outrageous counter, like a fat finger mistake
auto counter = PerfCounters::Create(
{kGenericPerfEvent3, "not a counter name", kGenericPerfEvent1});
EXPECT_TRUE(counter.IsValid());
EXPECT_EQ(counter.num_counters(), 2);
EXPECT_EQ(counter.names(), std::vector<std::string>(
{kGenericPerfEvent3, kGenericPerfEvent1}));
}
{
EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
kGenericPerfEvent3})
.IsValid());
// Finally try a golden input - it should accept all of them
EXPECT_EQ(PerfCounters::Create(
{kGenericPerfEvent1, kGenericPerfEvent2, kGenericPerfEvent3})
.num_counters(),
3);
}
{
// Add a bad apple in the end of the chain to check the edges
auto counter = PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
kGenericPerfEvent3,
"MISPREDICTED_BRANCH_RETIRED"});
EXPECT_TRUE(counter.IsValid());
EXPECT_EQ(counter.num_counters(), 3);
EXPECT_EQ(counter.names(),
std::vector<std::string>({kGenericPerfEvent1, kGenericPerfEvent2,
@@ -82,7 +87,7 @@ TEST(PerfCountersTest, Read1Counter) {
}
EXPECT_TRUE(PerfCounters::Initialize());
auto counters = PerfCounters::Create({kGenericPerfEvent1});
EXPECT_TRUE(counters.IsValid());
EXPECT_EQ(counters.num_counters(), 1);
PerfCounterValues values1(1);
EXPECT_TRUE(counters.Snapshot(&values1));
EXPECT_GT(values1[0], 0);
@@ -99,7 +104,7 @@ TEST(PerfCountersTest, Read2Counters) {
EXPECT_TRUE(PerfCounters::Initialize());
auto counters =
PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
EXPECT_TRUE(counters.IsValid());
EXPECT_EQ(counters.num_counters(), 2);
PerfCounterValues values1(2);
EXPECT_TRUE(counters.Snapshot(&values1));
EXPECT_GT(values1[0], 0);
@@ -111,62 +116,107 @@ TEST(PerfCountersTest, Read2Counters) {
}
TEST(PerfCountersTest, ReopenExistingCounters) {
// The test works (i.e. causes read to fail) for the assumptions
// about hardware capabilities (i.e. small number (3-4) hardware
// counters) at this date.
// This test works on recent and old Intel hardware.
// However, we cannot assume more than 3 HW counters
if (!PerfCounters::kSupported) {
GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
}
EXPECT_TRUE(PerfCounters::Initialize());
std::vector<PerfCounters> counters;
counters.reserve(6);
for (int i = 0; i < 6; i++)
counters.push_back(PerfCounters::Create({kGenericPerfEvent1}));
std::vector<std::string> kMetrics({kGenericPerfEvent1});
std::vector<PerfCounters> counters(3);
for (auto& counter : counters) {
counter = PerfCounters::Create(kMetrics);
}
PerfCounterValues values(1);
EXPECT_TRUE(counters[0].Snapshot(&values));
EXPECT_FALSE(counters[4].Snapshot(&values));
EXPECT_FALSE(counters[5].Snapshot(&values));
EXPECT_TRUE(counters[1].Snapshot(&values));
EXPECT_TRUE(counters[2].Snapshot(&values));
}
TEST(PerfCountersTest, CreateExistingMeasurements) {
// The test works (i.e. causes read to fail) for the assumptions
// about hardware capabilities (i.e. small number (3-4) hardware
// about hardware capabilities (i.e. small number (3) hardware
// counters) at this date,
// the same as previous test ReopenExistingCounters.
if (!PerfCounters::kSupported) {
GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
}
EXPECT_TRUE(PerfCounters::Initialize());
std::vector<PerfCountersMeasurement> perf_counter_measurements;
// This means we will try 10 counters, but at this time we can only
// guarantee that 3 will work. Perhaps in the future
// we could use libpfm to query for the hardware limits on this
// particular platform.
const int kMaxCounters = 10;
const int kMinValidCounters = 3;
// Let's use a ubiquitous counter that is guaranteed to work
// on all platforms
const std::vector<std::string> kMetrics{"cycles"};
// Cannot create a vector of actual objects because the
// copy constructor of PerfCounters is deleted - and so it is
// implicitly deleted on PerfCountersMeasurement too
std::vector<std::unique_ptr<PerfCountersMeasurement>>
perf_counter_measurements;
perf_counter_measurements.reserve(kMaxCounters);
for (int j = 0; j < kMaxCounters; ++j) {
perf_counter_measurements.emplace_back(
new PerfCountersMeasurement(kMetrics));
}
std::vector<std::pair<std::string, double>> measurements;
perf_counter_measurements.reserve(10);
for (int i = 0; i < 10; i++)
perf_counter_measurements.emplace_back(
std::vector<std::string>{kGenericPerfEvent1});
// Start all counters together to see if they hold
int max_counters = kMaxCounters;
for (int i = 0; i < kMaxCounters; ++i) {
auto& counter(*perf_counter_measurements[i]);
EXPECT_EQ(counter.num_counters(), 1);
if (!counter.Start()) {
max_counters = i;
break;
}
}
perf_counter_measurements[0].Start();
EXPECT_TRUE(perf_counter_measurements[0].Stop(measurements));
ASSERT_GE(max_counters, kMinValidCounters);
measurements.clear();
perf_counter_measurements[8].Start();
EXPECT_FALSE(perf_counter_measurements[8].Stop(measurements));
// Stop all together
for (int i = 0; i < max_counters; ++i) {
auto& counter(*perf_counter_measurements[i]);
EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
}
measurements.clear();
perf_counter_measurements[9].Start();
EXPECT_FALSE(perf_counter_measurements[9].Stop(measurements));
// Start/stop individually
for (int i = 0; i < max_counters; ++i) {
auto& counter(*perf_counter_measurements[i]);
measurements.clear();
counter.Start();
EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
}
}
size_t do_work() {
size_t res = 0;
for (size_t i = 0; i < 100000000; ++i) res += i * i;
return res;
// We try to do some meaningful work here, but the compiler
// insists on optimizing away our loop, so we had to add a
// no-optimize macro. In case it fails, we added some entropy
// to this pool as well.
BENCHMARK_DONT_OPTIMIZE size_t do_work() {
static std::mt19937 rd{std::random_device{}()};
static std::uniform_int_distribution<size_t> mrand(0, 10);
const size_t kNumLoops = 1000000;
size_t sum = 0;
for (size_t j = 0; j < kNumLoops; ++j) {
sum += mrand(rd);
}
benchmark::DoNotOptimize(sum);
return sum;
}
void measure(size_t threadcount, PerfCounterValues* values1,
PerfCounterValues* values2) {
BM_CHECK_NE(values1, nullptr);
BM_CHECK_NE(values2, nullptr);
void measure(size_t threadcount, PerfCounterValues* before,
PerfCounterValues* after) {
BM_CHECK_NE(before, nullptr);
BM_CHECK_NE(after, nullptr);
std::vector<std::thread> threads(threadcount);
auto work = [&]() { BM_CHECK(do_work() > 1000); };
@@ -178,9 +228,9 @@ void measure(size_t threadcount, PerfCounterValues* values1,
auto counters =
PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent3});
for (auto& t : threads) t = std::thread(work);
counters.Snapshot(values1);
counters.Snapshot(before);
for (auto& t : threads) t.join();
counters.Snapshot(values2);
counters.Snapshot(after);
}
TEST(PerfCountersTest, MultiThreaded) {
@@ -188,21 +238,29 @@ TEST(PerfCountersTest, MultiThreaded) {
GTEST_SKIP() << "Test skipped because libpfm is not supported.";
}
EXPECT_TRUE(PerfCounters::Initialize());
PerfCounterValues values1(2);
PerfCounterValues values2(2);
PerfCounterValues before(2);
PerfCounterValues after(2);
measure(2, &values1, &values2);
std::vector<double> D1{static_cast<double>(values2[0] - values1[0]),
static_cast<double>(values2[1] - values1[1])};
// Notice that this test will work even if we taskset it to a single CPU.
// In that case the threads will run sequentially.
// Start two threads and measure the number of combined cycles and
// instructions
measure(2, &before, &after);
std::vector<double> Elapsed2Threads{
static_cast<double>(after[0] - before[0]),
static_cast<double>(after[1] - before[1])};
measure(4, &values1, &values2);
std::vector<double> D2{static_cast<double>(values2[0] - values1[0]),
static_cast<double>(values2[1] - values1[1])};
// Start four threads and measure the number of combined cycles and
// instructions
measure(4, &before, &after);
std::vector<double> Elapsed4Threads{
static_cast<double>(after[0] - before[0]),
static_cast<double>(after[1] - before[1])};
// Some extra work will happen on the main thread - like joining the threads
// - so the ratio won't be quite 2.0, but very close.
EXPECT_GE(D2[0], 1.9 * D1[0]);
EXPECT_GE(D2[1], 1.9 * D1[1]);
EXPECT_GE(Elapsed4Threads[0], 1.9 * Elapsed2Threads[0]);
EXPECT_GE(Elapsed4Threads[1], 1.9 * Elapsed2Threads[1]);
}
TEST(PerfCountersTest, HardwareLimits) {