Cache PerfCounters instance in PerfCountersMeasurement (#1308)

This patch fixes #1306, by reducing the pinned instances of PerfCounters. The issue is caused by creating multiple pinned events in the same thread, doing so results in the Snapshot(PerfCounterValues* values) failing, and that's now discoverable. Creating multile pinned events is an unsupported behavior currently. The error would be detected at read() time, not perf_event_open() / iotcl() time. The unsupported benavior above is confirmed by Stephane Eranian @seranian, and he also pointed the dectection method. Finished this patch under the guidance of Mircea Trofin @mtrofin.
2022-01-25 19:14:20 +09:00 · 2022-01-25 19:14:20 +09:00 · d0fbf8ac23
parent 57b2bfa33b
commit d0fbf8ac23
5 changed files with 123 additions and 25 deletions
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@ -184,7 +184,10 @@ void State::PauseTiming() {
  BM_CHECK(started_ && !finished_ && !error_occurred_);
  timer_->StopTimer();
  if (perf_counters_measurement_) {
-    auto measurements = perf_counters_measurement_->StopAndGetMeasurements();
+    std::vector<std::pair<std::string, double>> measurements;
+    if (!perf_counters_measurement_->Stop(measurements)) {
+      BM_CHECK(false) << "Perf counters read the value failed.";
+    }
    for (const auto& name_and_measurement : measurements) {
      auto name = name_and_measurement.first;
      auto measurement = name_and_measurement.second;
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@ -152,8 +152,7 @@ BenchmarkRunner::BenchmarkRunner(
      has_explicit_iteration_count(b.iterations() != 0),
      pool(b.threads() - 1),
      iters(has_explicit_iteration_count ? b.iterations() : 1),
-      perf_counters_measurement(
-          PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))),
+      perf_counters_measurement(StrSplit(FLAGS_benchmark_perf_counters, ',')),
      perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
                                        ? &perf_counters_measurement
                                        : nullptr) {
--- a/src/perf_counters.cc
+++ b/src/perf_counters.cc
@ -15,6 +15,7 @@
 #include "perf_counters.h"

 #include <cstring>
+#include <memory>
 #include <vector>

 #if defined HAVE_LIBPFM
@ -104,7 +105,7 @@ PerfCounters PerfCounters::Create(
  return PerfCounters(counter_names, std::move(counter_ids));
 }

-PerfCounters::~PerfCounters() {
+void PerfCounters::CloseCounters() const {
  if (counter_ids_.empty()) {
    return;
  }
@ -126,7 +127,44 @@ PerfCounters PerfCounters::Create(
  return NoCounters();
 }

-PerfCounters::~PerfCounters() = default;
+void PerfCounters::CloseCounters() const {}
 #endif  // defined HAVE_LIBPFM
+
+Mutex PerfCountersMeasurement::mutex_;
+int PerfCountersMeasurement::ref_count_ = 0;
+PerfCounters PerfCountersMeasurement::counters_ = PerfCounters::NoCounters();
+
+PerfCountersMeasurement::PerfCountersMeasurement(
+    const std::vector<std::string>& counter_names)
+    : start_values_(counter_names.size()), end_values_(counter_names.size()) {
+  MutexLock l(mutex_);
+  if (ref_count_ == 0) {
+    counters_ = PerfCounters::Create(counter_names);
+  }
+  // We chose to increment it even if `counters_` ends up invalid,
+  // so that we don't keep trying to create, and also since the dtor
+  // will decrement regardless of `counters_`'s validity
+  ++ref_count_;
+
+  BM_CHECK(!counters_.IsValid() || counters_.names() == counter_names);
+}
+
+PerfCountersMeasurement::~PerfCountersMeasurement() {
+  MutexLock l(mutex_);
+  --ref_count_;
+  if (ref_count_ == 0) {
+    counters_ = PerfCounters::NoCounters();
+  }
+}
+
+PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {
+  if (this != &other) {
+    CloseCounters();
+
+    counter_ids_ = std::move(other.counter_ids_);
+    counter_names_ = std::move(other.counter_names_);
+  }
+  return *this;
+}
 }  // namespace internal
 }  // namespace benchmark
--- a/src/perf_counters.h
+++ b/src/perf_counters.h
@ -17,11 +17,13 @@

 #include <array>
 #include <cstdint>
+#include <memory>
 #include <vector>

 #include "benchmark/benchmark.h"
 #include "check.h"
 #include "log.h"
+#include "mutex.h"

 #ifndef BENCHMARK_OS_WINDOWS
 #include <unistd.h>
@ -71,12 +73,14 @@ class PerfCounters final {
  // True iff this platform supports performance counters.
  static const bool kSupported;

-  bool IsValid() const { return is_valid_; }
+  bool IsValid() const { return !counter_names_.empty(); }
  static PerfCounters NoCounters() { return PerfCounters(); }

-  ~PerfCounters();
+  ~PerfCounters() { CloseCounters(); }
  PerfCounters(PerfCounters&&) = default;
  PerfCounters(const PerfCounters&) = delete;
+  PerfCounters& operator=(PerfCounters&&) noexcept;
+  PerfCounters& operator=(const PerfCounters&) = delete;

  // Platform-specific implementations may choose to do some library
  // initialization here.
@ -111,24 +115,27 @@ class PerfCounters final {
 private:
  PerfCounters(const std::vector<std::string>& counter_names,
               std::vector<int>&& counter_ids)
-      : counter_ids_(std::move(counter_ids)),
-        counter_names_(counter_names),
-        is_valid_(true) {}
-  PerfCounters() : is_valid_(false) {}
+      : counter_ids_(std::move(counter_ids)), counter_names_(counter_names) {}
+  PerfCounters() = default;
+
+  void CloseCounters() const;

  std::vector<int> counter_ids_;
-  const std::vector<std::string> counter_names_;
-  const bool is_valid_;
+  std::vector<std::string> counter_names_;
 };

 // Typical usage of the above primitives.
 class PerfCountersMeasurement final {
 public:
-  PerfCountersMeasurement(PerfCounters&& c)
-      : counters_(std::move(c)),
-        start_values_(counters_.IsValid() ? counters_.names().size() : 0),
-        end_values_(counters_.IsValid() ? counters_.names().size() : 0) {}
+  PerfCountersMeasurement(const std::vector<std::string>& counter_names);
+  ~PerfCountersMeasurement();

+  // The only way to get to `counters_` is after ctor-ing a
+  // `PerfCountersMeasurement`, which means that `counters_`'s state is, here,
+  // decided (either invalid or valid) and won't change again even if a ctor is
+  // concurrently running with this. This is preferring efficiency to
+  // maintainability, because the address of the static can be known at compile
+  // time.
  bool IsValid() const { return counters_.IsValid(); }

  BENCHMARK_ALWAYS_INLINE void Start() {
@ -136,30 +143,33 @@ class PerfCountersMeasurement final {
    // Tell the compiler to not move instructions above/below where we take
    // the snapshot.
    ClobberMemory();
-    counters_.Snapshot(&start_values_);
+    valid_read_ &= counters_.Snapshot(&start_values_);
    ClobberMemory();
  }

-  BENCHMARK_ALWAYS_INLINE std::vector<std::pair<std::string, double>>
-  StopAndGetMeasurements() {
+  BENCHMARK_ALWAYS_INLINE bool Stop(
+      std::vector<std::pair<std::string, double>>& measurements) {
    assert(IsValid());
    // Tell the compiler to not move instructions above/below where we take
    // the snapshot.
    ClobberMemory();
-    counters_.Snapshot(&end_values_);
+    valid_read_ &= counters_.Snapshot(&end_values_);
    ClobberMemory();

-    std::vector<std::pair<std::string, double>> ret;
    for (size_t i = 0; i < counters_.names().size(); ++i) {
      double measurement = static_cast<double>(end_values_[i]) -
                           static_cast<double>(start_values_[i]);
-      ret.push_back({counters_.names()[i], measurement});
+      measurements.push_back({counters_.names()[i], measurement});
    }
-    return ret;
+
+    return valid_read_;
  }

 private:
-  PerfCounters counters_;
+  static Mutex mutex_;
+  GUARDED_BY(mutex_) static int ref_count_;
+  GUARDED_BY(mutex_) static PerfCounters counters_;
+  bool valid_read_ = true;
  PerfCounterValues start_values_;
  PerfCounterValues end_values_;
 };
--- a/test/perf_counters_gtest.cc
+++ b/test/perf_counters_gtest.cc
@ -11,6 +11,7 @@ struct MsgHandler {
 #endif

 using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCountersMeasurement;
 using benchmark::internal::PerfCounterValues;

 namespace {
@ -95,6 +96,53 @@ TEST(PerfCountersTest, Read2Counters) {
  EXPECT_GT(values2[1], 0);
 }

+TEST(PerfCountersTest, ReopenExistingCounters) {
+  // The test works (i.e. causes read to fail) for the assumptions
+  // about hardware capabilities (i.e. small number (3-4) hardware
+  // counters) at this date.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  std::vector<PerfCounters> counters;
+  counters.reserve(6);
+  for (int i = 0; i < 6; i++)
+    counters.push_back(PerfCounters::Create({kGenericPerfEvent1}));
+  PerfCounterValues values(1);
+  EXPECT_TRUE(counters[0].Snapshot(&values));
+  EXPECT_FALSE(counters[4].Snapshot(&values));
+  EXPECT_FALSE(counters[5].Snapshot(&values));
+}
+
+TEST(PerfCountersTest, CreateExistingMeasurements) {
+  // The test works (i.e. causes read to fail) for the assumptions
+  // about hardware capabilities (i.e. small number (3-4) hardware
+  // counters) at this date,
+  // the same as previous test ReopenExistingCounters.
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  std::vector<PerfCountersMeasurement> perf_counter_measurements;
+  std::vector<std::pair<std::string, double>> measurements;
+
+  perf_counter_measurements.reserve(10);
+  for (int i = 0; i < 10; i++)
+    perf_counter_measurements.emplace_back(
+        std::vector<std::string>{kGenericPerfEvent1});
+
+  perf_counter_measurements[0].Start();
+  EXPECT_TRUE(perf_counter_measurements[0].Stop(measurements));
+
+  measurements.clear();
+  perf_counter_measurements[8].Start();
+  EXPECT_FALSE(perf_counter_measurements[8].Stop(measurements));
+
+  measurements.clear();
+  perf_counter_measurements[9].Start();
+  EXPECT_FALSE(perf_counter_measurements[9].Stop(measurements));
+}
+
 size_t do_work() {
  size_t res = 0;
  for (size_t i = 0; i < 100000000; ++i) res += i * i;