From 376ebc26354ca2b79af94467133f3c35b539627e Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 28 Apr 2021 01:25:29 -0700
Subject: [PATCH] Support optional, user-directed collection of performance
 counters (#1114)

* Support optional, user-directed collection of performance counters

The patch helps an engineer drill into the root causes of a regression,
for example. Currently, only single-threaded runs are supported.

The feature is a build-time opt-in, and then a runtime opt-in. The
engineer may run the benchmark executable, passing a list of
performance counter names (using libpfm's naming scheme) at the command
line. The counter values will then be collected and reported back as
UserCounters.

This is different from #240 in that it is a benchmark user opt-in, and
the counter collection is transparent to the benchmark. Currently, this
is only supported on platforms where libpfm is available.

libpfm: http://perfmon2.sourceforge.net/

* 'Use' values param in Snapshot when BENCHMARK_OS_WINDOWS

This is to avoid an unused-parameter warning-as-error.

* Added missing include for <unistd.h> in perf_counters.cc

* Moved doc to docs

* Added license blurbs
---
 .../workflows/build-and-test-perfcounters.yml |  44 +++++
 CMakeLists.txt                                |   4 +
 README.md                                     |   2 +
 cmake/Modules/FindPFM.cmake                   |  19 ++
 docs/perf_counters.md                         |  35 ++++
 include/benchmark/benchmark.h                 |   9 +-
 src/CMakeLists.txt                            |   6 +
 src/benchmark.cc                              |  27 ++-
 src/benchmark_api_internal.cc                 |  10 +-
 src/benchmark_api_internal.h                  |   3 +-
 src/benchmark_runner.cc                       |  29 ++-
 src/benchmark_runner.h                        |   2 +
 src/perf_counters.cc                          | 128 +++++++++++++
 src/perf_counters.h                           | 172 ++++++++++++++++++
 src/string_util.cc                            |  12 ++
 src/string_util.h                             |   2 +
 test/CMakeLists.txt                           |   4 +
 test/perf_counters_gtest.cc                   |  95 ++++++++++
 test/perf_counters_test.cc                    |  27 +++
 test/string_util_gtest.cc                     |   8 +
 20 files changed, 621 insertions(+), 17 deletions(-)
 create mode 100644 .github/workflows/build-and-test-perfcounters.yml
 create mode 100644 cmake/Modules/FindPFM.cmake
 create mode 100644 docs/perf_counters.md
 create mode 100644 src/perf_counters.cc
 create mode 100644 src/perf_counters.h
 create mode 100644 test/perf_counters_gtest.cc
 create mode 100644 test/perf_counters_test.cc

diff --git a/.github/workflows/build-and-test-perfcounters.yml b/.github/workflows/build-and-test-perfcounters.yml
new file mode 100644
index 00000000..dfb88cbc
--- /dev/null
+++ b/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,44 @@
+name: build-and-test-perfcounters
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: install libpfm
+      run: sudo apt install libpfm4-dev
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake -DBENCHMARK_ENABLE_LIBPFM=1 -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    # Skip testing, for now. It seems perf_event_open does not succeed on the
+    # hosting machine, very likely a permissions issue.
+    # TODO(mtrofin): Enable test.
+    # - name: test
+    #   shell: bash
+    #   working-directory: ${{ runner.workspace }}/_build
+    #   run: sudo ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10072545..7e0f251f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -270,6 +270,10 @@ cxx_feature_check(STEADY_CLOCK)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
+if (BENCHMARK_ENABLE_LIBPFM)
+  find_package(PFM)
+endif()
+
 # Set up directories
 include_directories(${PROJECT_SOURCE_DIR}/include)
diff --git a/README.md b/README.md
index ceedb334..e8c65f8d 100644
--- a/README.md
+++ b/README.md
@@ -297,6 +297,8 @@ too (`-lkstat`).
 
 [Setting the Time Unit](#setting-the-time-unit)
 
+[User-Requested Performance Counters](docs/perf_counters.md)
+
 [Preventing Optimization](#preventing-optimization)
 
 [Reporting Statistics](#reporting-statistics)
diff --git a/cmake/Modules/FindPFM.cmake b/cmake/Modules/FindPFM.cmake
new file mode 100644
index 00000000..553d458c
--- /dev/null
+++ b/cmake/Modules/FindPFM.cmake
@@ -0,0 +1,19 @@
+# If successful, the following variable will be defined:
+# HAVE_LIBPFM.
+# Set BENCHMARK_ENABLE_LIBPFM to 0 to disable, regardless of libpfm presence.
+include(CheckIncludeFile)
+include(CheckLibraryExists)
+enable_language(C)
+
+check_library_exists(libpfm.a pfm_initialize "" HAVE_LIBPFM_INITIALIZE)
+if(HAVE_LIBPFM_INITIALIZE)
+  check_include_file(perfmon/perf_event.h HAVE_PERFMON_PERF_EVENT_H)
+  check_include_file(perfmon/pfmlib.h HAVE_PERFMON_PFMLIB_H)
+  check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
+  if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
+    message("Using Perf Counters.")
+    set(HAVE_LIBPFM 1)
+  endif()
+else()
+  message("Perf Counters support requested, but was unable to find libpfm.")
+endif()
diff --git a/docs/perf_counters.md b/docs/perf_counters.md
new file mode 100644
index 00000000..43ff4517
--- /dev/null
+++ b/docs/perf_counters.md
@@ -0,0 +1,35 @@
+
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios:
+narrowing down the cause of a regression, or verifying that the underlying
+cause of a performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+  Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+  this requires [libpfm](http://perfmon2.sourceforge.net/), which must be
+  available at build time, and
+* The benchmark is run on a single thread (a current limitation).
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the same boundaries where timer collection is handled.
+
+To opt in:
+
+* Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+* Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM`.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm and
+are therefore platform-specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`)
+are mapped by libpfm to the appropriate platform-specific event; see the
+libpfm [documentation](http://perfmon2.sourceforge.net/docs.html) for more
+details.
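+
+For example, assuming a benchmark binary `mybench` built with
+`BENCHMARK_ENABLE_LIBPFM` (the binary name and counter choice here are
+illustrative), running
+`./mybench --benchmark_perf_counters=CYCLES,INSTRUCTIONS` collects those two
+counters for every benchmark in the binary.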
+ +The counter values are reported back through the [User Counters](../README.md#custom-counters) +mechanism, meaning, they are available in all the formats (e.g. JSON) supported +by User Counters. \ No newline at end of file diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 881ce9e5..664422b3 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -448,6 +448,7 @@ struct Statistics { struct BenchmarkInstance; class ThreadTimer; class ThreadManager; +class PerfCountersMeasurement; enum AggregationReportMode #if defined(BENCHMARK_HAS_CXX11) @@ -687,15 +688,17 @@ class State { private: State(IterationCount max_iters, const std::vector& ranges, int thread_i, int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager); + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement); void StartKeepRunning(); // Implementation of KeepRunning() and KeepRunningBatch(). // is_batch must be true unless n is 1. bool KeepRunningInternal(IterationCount n, bool is_batch); void FinishKeepRunning(); - internal::ThreadTimer* timer_; - internal::ThreadManager* manager_; + internal::ThreadTimer* const timer_; + internal::ThreadManager* const manager_; + internal::PerfCountersMeasurement* const perf_counters_measurement_; friend struct internal::BenchmarkInstance; }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 35d559ee..a6c8e9a7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -28,6 +28,12 @@ target_include_directories(benchmark PUBLIC $ ) +# libpfm, if available +if (HAVE_LIBPFM) + target_link_libraries(benchmark libpfm.a) + add_definitions(-DHAVE_LIBPFM) +endif() + # Link threads. target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) find_library(LIBRT rt) diff --git a/src/benchmark.cc b/src/benchmark.cc index ffe4bf45..1fea654c 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -45,6 +45,7 @@ #include "internal_macros.h" #include "log.h" #include "mutex.h" +#include "perf_counters.h" #include "re.h" #include "statistics.h" #include "string_util.h" @@ -106,6 +107,10 @@ DEFINE_bool(benchmark_counters_tabular, false); // The level of verbose logging to output DEFINE_int32(v, 0); +// List of additional perf counters to collect, in libpfm format. 
For more +// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html +DEFINE_string(benchmark_perf_counters, ""); + namespace benchmark { namespace internal { @@ -117,7 +122,8 @@ void UseCharPointer(char const volatile*) {} State::State(IterationCount max_iters, const std::vector& ranges, int thread_i, int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager) + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) : total_iterations_(0), batch_leftover_(0), max_iterations(max_iters), @@ -130,7 +136,8 @@ State::State(IterationCount max_iters, const std::vector& ranges, thread_index(thread_i), threads(n_threads), timer_(timer), - manager_(manager) { + manager_(manager), + perf_counters_measurement_(perf_counters_measurement) { CHECK(max_iterations != 0) << "At least one iteration must be run"; CHECK_LT(thread_index, threads) << "thread_index must be less than threads"; @@ -163,11 +170,23 @@ void State::PauseTiming() { // Add in time accumulated so far CHECK(started_ && !finished_ && !error_occurred_); timer_->StopTimer(); + if (perf_counters_measurement_) { + auto measurements = perf_counters_measurement_->StopAndGetMeasurements(); + for (const auto& name_and_measurement : measurements) { + auto name = name_and_measurement.first; + auto measurement = name_and_measurement.second; + CHECK_EQ(counters[name], 0.0); + counters[name] = Counter(measurement, Counter::kAvgIterations); + } + } } void State::ResumeTiming() { CHECK(started_ && !finished_ && !error_occurred_); timer_->StartTimer(); + if (perf_counters_measurement_) { + perf_counters_measurement_->Start(); + } } void State::SkipWithError(const char* msg) { @@ -457,7 +476,9 @@ void ParseCommandLineFlags(int* argc, char** argv) { ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) || ParseBoolFlag(argv[i], "benchmark_counters_tabular", &FLAGS_benchmark_counters_tabular) || - ParseInt32Flag(argv[i], "v", &FLAGS_v)) { + ParseInt32Flag(argv[i], "v", &FLAGS_v) || + ParseStringFlag(argv[i], "benchmark_perf_counters", + &FLAGS_benchmark_perf_counters)) { for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1]; --(*argc); diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index d468a257..804ef894 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -3,10 +3,12 @@ namespace benchmark { namespace internal { -State BenchmarkInstance::Run(IterationCount iters, int thread_id, - internal::ThreadTimer* timer, - internal::ThreadManager* manager) const { - State st(iters, arg, thread_id, threads, timer, manager); +State BenchmarkInstance::Run( + IterationCount iters, int thread_id, internal::ThreadTimer* timer, + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) const { + State st(iters, arg, thread_id, threads, timer, manager, + perf_counters_measurement); benchmark->Run(st); return st; } diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 264eff95..b740bce1 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -36,7 +36,8 @@ struct BenchmarkInstance { int threads; // Number of concurrent threads to us State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, - internal::ThreadManager* manager) const; + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) const; }; bool FindBenchmarksInternal(const std::string& re, diff --git 
a/src/benchmark_runner.cc b/src/benchmark_runner.cc index d081aa86..083d1849 100644 --- a/src/benchmark_runner.cc +++ b/src/benchmark_runner.cc @@ -45,6 +45,7 @@ #include "internal_macros.h" #include "log.h" #include "mutex.h" +#include "perf_counters.h" #include "re.h" #include "statistics.h" #include "string_util.h" @@ -111,12 +112,14 @@ BenchmarkReporter::Run CreateRunReport( // Execute one thread of benchmark b for the specified number of iterations. // Adds the stats collected for the thread into manager->results. void RunInThread(const BenchmarkInstance* b, IterationCount iters, - int thread_id, ThreadManager* manager) { + int thread_id, ThreadManager* manager, + PerfCountersMeasurement* perf_counters_measurement) { internal::ThreadTimer timer( b->measure_process_cpu_time ? internal::ThreadTimer::CreateProcessCpuTime() : internal::ThreadTimer::Create()); - State st = b->Run(iters, thread_id, &timer, manager); + State st = + b->Run(iters, thread_id, &timer, manager, perf_counters_measurement); CHECK(st.error_occurred() || st.iterations() >= st.max_iterations) << "Benchmark returned before State::KeepRunning() returned false!"; { @@ -143,7 +146,12 @@ class BenchmarkRunner { : FLAGS_benchmark_repetitions), has_explicit_iteration_count(b.iterations != 0), pool(b.threads - 1), - iters(has_explicit_iteration_count ? b.iterations : 1) { + iters(has_explicit_iteration_count ? b.iterations : 1), + perf_counters_measurement( + PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))), + perf_counters_measurement_ptr(perf_counters_measurement.IsValid() + ? &perf_counters_measurement + : nullptr) { run_results.display_report_aggregates_only = (FLAGS_benchmark_report_aggregates_only || FLAGS_benchmark_display_aggregates_only); @@ -155,6 +163,11 @@ class BenchmarkRunner { internal::ARM_DisplayReportAggregatesOnly); run_results.file_report_aggregates_only = (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly); + CHECK(b.threads == 1 || !perf_counters_measurement.IsValid()) + << "Perf counters are not supported in multi-threaded cases.\n"; + CHECK(FLAGS_benchmark_perf_counters.empty() || + perf_counters_measurement.IsValid()) + << "Perf counters were requested but could not be set up."; } for (int repetition_num = 0; repetition_num < repeats; repetition_num++) { @@ -192,6 +205,9 @@ class BenchmarkRunner { // So only the first repetition has to find/calculate it, // the other repetitions will just use that precomputed iteration count. + PerfCountersMeasurement perf_counters_measurement; + PerfCountersMeasurement* const perf_counters_measurement_ptr; + struct IterationResults { internal::ThreadManager::Result results; IterationCount iters; @@ -206,12 +222,12 @@ class BenchmarkRunner { // Run all but one thread in separate threads for (std::size_t ti = 0; ti < pool.size(); ++ti) { pool[ti] = std::thread(&RunInThread, &b, iters, static_cast(ti + 1), - manager.get()); + manager.get(), perf_counters_measurement_ptr); } // And run one thread here directly. // (If we were asked to run just one thread, we don't create new threads.) // Yes, we need to do this here *after* we start the separate threads. - RunInThread(&b, iters, 0, manager.get()); + RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr); // The main thread has finished. Now let's wait for the other threads. 
manager->WaitForAllThreads(); @@ -331,7 +347,8 @@ class BenchmarkRunner { memory_manager->Start(); std::unique_ptr manager; manager.reset(new internal::ThreadManager(1)); - RunInThread(&b, memory_iterations, 0, manager.get()); + RunInThread(&b, memory_iterations, 0, manager.get(), + perf_counters_measurement_ptr); manager->WaitForAllThreads(); manager.reset(); diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h index 96e8282a..9b0cf2a6 100644 --- a/src/benchmark_runner.h +++ b/src/benchmark_runner.h @@ -26,6 +26,8 @@ DECLARE_bool(benchmark_report_aggregates_only); DECLARE_bool(benchmark_display_aggregates_only); +DECLARE_string(benchmark_perf_counters); + namespace benchmark { namespace internal { diff --git a/src/perf_counters.cc b/src/perf_counters.cc new file mode 100644 index 00000000..eb28cd99 --- /dev/null +++ b/src/perf_counters.cc @@ -0,0 +1,128 @@ +// Copyright 2021 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "perf_counters.h" + +#include +#include + +#if defined HAVE_LIBPFM +#include "perfmon/pfmlib.h" +#include "perfmon/pfmlib_perf_event.h" +#endif + +namespace benchmark { +namespace internal { + +constexpr size_t PerfCounterValues::kMaxCounters; + +#if defined HAVE_LIBPFM +const bool PerfCounters::kSupported = true; + +bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; } + +PerfCounters PerfCounters::Create( + const std::vector& counter_names) { + if (counter_names.empty()) { + return NoCounters(); + } + if (counter_names.size() > PerfCounterValues::kMaxCounters) { + GetErrorLogInstance() + << counter_names.size() + << " counters were requested. The minimum is 1, the maximum is " + << PerfCounterValues::kMaxCounters << "\n"; + return NoCounters(); + } + std::vector counter_ids(counter_names.size()); + + const int mode = PFM_PLM3; // user mode only + for (size_t i = 0; i < counter_names.size(); ++i) { + const bool is_first = i == 0; + struct perf_event_attr attr{}; + attr.size = sizeof(attr); + const int group_id = !is_first ? counter_ids[0] : -1; + const auto& name = counter_names[i]; + if (name.empty()) { + GetErrorLogInstance() << "A counter name was the empty string\n"; + return NoCounters(); + } + pfm_perf_encode_arg_t arg{}; + arg.attr = &attr; + + const int pfm_get = + pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg); + if (pfm_get != PFM_SUCCESS) { + GetErrorLogInstance() << "Unknown counter name: " << name << "\n"; + return NoCounters(); + } + attr.disabled = is_first; + attr.pinned = is_first; + attr.exclude_kernel = true; + attr.exclude_user = false; + attr.exclude_hv = true; + // Read all counters in one read. + attr.read_format = PERF_FORMAT_GROUP; + + int id = -1; + static constexpr size_t kNrOfSyscallRetries = 5; + // Retry syscall as it was interrupted often (b/64774091). 
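+    // EINTR from perf_event_open means the call was interrupted by a signal
+    // before a file descriptor was allocated, so no resource was created and
+    // retrying a bounded number of times is safe.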
+ for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries; + ++num_retries) { + id = perf_event_open(&attr, 0, -1, group_id, 0); + if (id >= 0 || errno != EINTR) { + break; + } + } + if (id < 0) { + GetErrorLogInstance() + << "Failed to get a file descriptor for " << name << "\n"; + return NoCounters(); + } + + counter_ids[i] = id; + } + if (ioctl(counter_ids[0], PERF_EVENT_IOC_ENABLE) != 0) { + GetErrorLogInstance() << "Failed to start counters\n"; + return NoCounters(); + } + + return PerfCounters(counter_names, std::move(counter_ids)); +} + +PerfCounters::~PerfCounters() { + if (counter_ids_.empty()) { + return; + } + ioctl(counter_ids_[0], PERF_EVENT_IOC_DISABLE); + for (int fd : counter_ids_) { + close(fd); + } +} +#else // defined HAVE_LIBPFM +const bool PerfCounters::kSupported = false; + +bool PerfCounters::Initialize() { return false; } + +PerfCounters PerfCounters::Create( + const std::vector& counter_names) { + if (!counter_names.empty()) { + GetErrorLogInstance() << "Performance counters not supported."; + } + return NoCounters(); +} + +PerfCounters::~PerfCounters() = default; +#endif // defined HAVE_LIBPFM +} // namespace internal +} // namespace benchmark diff --git a/src/perf_counters.h b/src/perf_counters.h new file mode 100644 index 00000000..0c2c4616 --- /dev/null +++ b/src/perf_counters.h @@ -0,0 +1,172 @@ +// Copyright 2021 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef BENCHMARK_PERF_COUNTERS_H +#define BENCHMARK_PERF_COUNTERS_H + +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "check.h" +#include "log.h" + +#ifndef BENCHMARK_OS_WINDOWS +#include +#endif + +namespace benchmark { +namespace internal { + +// Typically, we can only read a small number of counters. There is also a +// padding preceding counter values, when reading multiple counters with one +// syscall (which is desirable). PerfCounterValues abstracts these details. +// The implementation ensures the storage is inlined, and allows 0-based +// indexing into the counter values. +// The object is used in conjunction with a PerfCounters object, by passing it +// to Snapshot(). The values are populated such that +// perfCounters->names()[i]'s value is obtained at position i (as given by +// operator[]) of this object. +class PerfCounterValues { + public: + explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) { + CHECK_LE(nr_counters_, kMaxCounters); + } + + uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; } + + static constexpr size_t kMaxCounters = 3; + + private: + friend class PerfCounters; + // Get the byte buffer in which perf counters can be captured. + // This is used by PerfCounters::Read + std::pair get_data_buffer() { + return {reinterpret_cast(values_.data()), + sizeof(uint64_t) * (kPadding + nr_counters_)}; + } + + static constexpr size_t kPadding = 1; + std::array values_; + const size_t nr_counters_; +}; + +// Collect PMU counters. 
The object, once constructed, is ready to be used by +// calling read(). PMU counter collection is enabled from the time create() is +// called, to obtain the object, until the object's destructor is called. +class PerfCounters final { + public: + // True iff this platform supports performance counters. + static const bool kSupported; + + bool IsValid() const { return is_valid_; } + static PerfCounters NoCounters() { return PerfCounters(); } + + ~PerfCounters(); + PerfCounters(PerfCounters&&) = default; + PerfCounters(const PerfCounters&) = delete; + + // Platform-specific implementations may choose to do some library + // initialization here. + static bool Initialize(); + + // Return a PerfCounters object ready to read the counters with the names + // specified. The values are user-mode only. The counter name format is + // implementation and OS specific. + // TODO: once we move to C++-17, this should be a std::optional, and then the + // IsValid() boolean can be dropped. + static PerfCounters Create(const std::vector& counter_names); + + // Take a snapshot of the current value of the counters into the provided + // valid PerfCounterValues storage. The values are populated such that: + // names()[i]'s value is (*values)[i] + BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) { +#ifndef BENCHMARK_OS_WINDOWS + assert(values != nullptr); + assert(IsValid()); + auto buffer = values->get_data_buffer(); + auto read_bytes = ::read(counter_ids_[0], buffer.first, buffer.second); + return static_cast(read_bytes) == buffer.second; +#else + (void)values; + return false; +#endif + } + + const std::vector& names() const { return counter_names_; } + size_t num_counters() const { return counter_names_.size(); } + + private: + PerfCounters(const std::vector& counter_names, + std::vector&& counter_ids) + : counter_ids_(std::move(counter_ids)), + counter_names_(counter_names), + is_valid_(true) {} + PerfCounters() : is_valid_(false) {} + + std::vector counter_ids_; + const std::vector counter_names_; + const bool is_valid_; +}; + +// Typical usage of the above primitives. +class PerfCountersMeasurement final { + public: + PerfCountersMeasurement(PerfCounters&& c) + : counters_(std::move(c)), + start_values_(counters_.IsValid() ? counters_.names().size() : 0), + end_values_(counters_.IsValid() ? counters_.names().size() : 0) {} + + bool IsValid() const { return counters_.IsValid(); } + + BENCHMARK_ALWAYS_INLINE void Start() { + assert(IsValid()); + // Tell the compiler to not move instructions above/below where we take + // the snapshot. + ClobberMemory(); + counters_.Snapshot(&start_values_); + ClobberMemory(); + } + + BENCHMARK_ALWAYS_INLINE std::vector> + StopAndGetMeasurements() { + assert(IsValid()); + // Tell the compiler to not move instructions above/below where we take + // the snapshot. 
+    ClobberMemory();
+    counters_.Snapshot(&end_values_);
+    ClobberMemory();
+
+    std::vector<std::pair<std::string, double>> ret;
+    for (size_t i = 0; i < counters_.names().size(); ++i) {
+      double measurement = static_cast<double>(end_values_[i]) -
+                           static_cast<double>(start_values_[i]);
+      ret.push_back({counters_.names()[i], measurement});
+    }
+    return ret;
+  }
+
+ private:
+  PerfCounters counters_;
+  PerfCounterValues start_values_;
+  PerfCounterValues end_values_;
+};
+
+BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_PERF_COUNTERS_H
diff --git a/src/string_util.cc b/src/string_util.cc
index ac60b558..53b1532b 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -163,6 +163,18 @@ std::string StrFormat(const char* format, ...) {
   return tmp;
 }
 
+std::vector<std::string> StrSplit(const std::string& str, char delim) {
+  std::vector<std::string> ret;
+  size_t first = 0;
+  size_t next = str.find(delim);
+  for (; next != std::string::npos;
+       first = next + 1, next = str.find(delim, first)) {
+    ret.push_back(str.substr(first, next - first));
+  }
+  ret.push_back(str.substr(first));
+  return ret;
+}
+
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
 /*
  * GNU STL in Android NDK lacks support for some C++11 functions, including
diff --git a/src/string_util.h b/src/string_util.h
index 09d7b4bd..6bc28b69 100644
--- a/src/string_util.h
+++ b/src/string_util.h
@@ -37,6 +37,8 @@ inline std::string StrCat(Args&&... args) {
   return ss.str();
 }
 
+std::vector<std::string> StrSplit(const std::string& str, char delim);
+
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
 /*
  * GNU STL in Android NDK lacks support for some C++11 functions, including
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c1a3a3fc..1e7b6829 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -128,6 +128,9 @@ add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_
 compile_output_test(user_counters_test)
 add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01)
 
+compile_output_test(perf_counters_test)
+add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01 --benchmark_perf_counters=CYCLES,BRANCHES)
+
 compile_output_test(internal_threading_test)
 add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01)
 
@@ -196,6 +199,7 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
   add_gtest(commandlineflags_gtest)
   add_gtest(statistics_gtest)
   add_gtest(string_util_gtest)
+  add_gtest(perf_counters_gtest)
 endif(BENCHMARK_ENABLE_GTEST_TESTS)
 
 ###############################################################################
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc
new file mode 100644
index 00000000..47894af4
--- /dev/null
+++ b/test/perf_counters_gtest.cc
@@ -0,0 +1,95 @@
+#include "../src/perf_counters.h"
+#include "gtest/gtest.h"
+
+#ifndef GTEST_SKIP
+struct MsgHandler {
+  void operator=(std::ostream&) {}
+};
+#define GTEST_SKIP() return MsgHandler() = std::cout
+#endif
+
+using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCounterValues;
+
+namespace {
+const char kGenericPerfEvent1[] = "CYCLES";
+const char kGenericPerfEvent2[] = "BRANCHES";
+const char kGenericPerfEvent3[] = "INSTRUCTIONS";
+
+TEST(PerfCountersTest, Init) {
+  EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
+}
+
+TEST(PerfCountersTest, OneCounter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Performance counters not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1}).IsValid());
+}
+
+TEST(PerfCountersTest, NegativeTest) {
+  if (!PerfCounters::kSupported) {
+    EXPECT_FALSE(PerfCounters::Initialize());
+    return;
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_FALSE(PerfCounters::Create({}).IsValid());
+  EXPECT_FALSE(PerfCounters::Create({""}).IsValid());
+  EXPECT_FALSE(PerfCounters::Create({"not a counter name"}).IsValid());
+  {
+    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                      kGenericPerfEvent3})
+                    .IsValid());
+  }
+  EXPECT_FALSE(
+      PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1})
+          .IsValid());
+  EXPECT_FALSE(PerfCounters::Create({kGenericPerfEvent3, "not a counter name",
+                                     kGenericPerfEvent1})
+                   .IsValid());
+  {
+    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                      kGenericPerfEvent3})
+                    .IsValid());
+  }
+  EXPECT_FALSE(
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                            kGenericPerfEvent3, "MISPREDICTED_BRANCH_RETIRED"})
+          .IsValid());
+}
+
+TEST(PerfCountersTest, Read1Counter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters = PerfCounters::Create({kGenericPerfEvent1});
+  EXPECT_TRUE(counters.IsValid());
+  PerfCounterValues values1(1);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  PerfCounterValues values2(1);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[0], values1[0]);
+}
+
+TEST(PerfCountersTest, Read2Counters) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters =
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
+  EXPECT_TRUE(counters.IsValid());
+  PerfCounterValues values1(2);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  EXPECT_GT(values1[1], 0);
+  PerfCounterValues values2(2);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[1], 0);
+}
+}  // namespace
diff --git a/test/perf_counters_test.cc b/test/perf_counters_test.cc
new file mode 100644
index 00000000..d6e0284d
--- /dev/null
+++ b/test/perf_counters_test.cc
@@ -0,0 +1,27 @@
+#undef NDEBUG
+
+#include "../src/perf_counters.h"
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+void BM_Simple(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_Simple);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Simple\",$"}});
+
+void CheckSimple(Results const& e) {
+  CHECK_COUNTER_VALUE(e, double, "CYCLES", GT, 0);
+  CHECK_COUNTER_VALUE(e, double, "BRANCHES", GT, 0.0);
+}
+CHECK_BENCHMARK_RESULTS("BM_Simple", &CheckSimple);
+
+int main(int argc, char* argv[]) {
+  if (!benchmark::internal::PerfCounters::kSupported) {
+    return 0;
+  }
+  RunOutputTests(argc, argv);
+}
diff --git a/test/string_util_gtest.cc b/test/string_util_gtest.cc
index 01bf155d..77a719a6 100644
--- a/test/string_util_gtest.cc
+++ b/test/string_util_gtest.cc
@@ -150,4 +150,12 @@ TEST(StringUtilTest, stod) {
 #endif
 }
 
+TEST(StringUtilTest, StrSplit) {
+  EXPECT_EQ(benchmark::StrSplit("", ','), std::vector<std::string>{""});
+  EXPECT_EQ(benchmark::StrSplit("hello", ','),
+            std::vector<std::string>({"hello"}));
+  EXPECT_EQ(benchmark::StrSplit("hello,there", ','),
+            std::vector<std::string>({"hello", "there"}));
+}
+
 } // end namespace
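
For context, a minimal end-to-end sketch follows. It is not part of the patch, and the benchmark body and names are illustrative; it shows the key property of the feature, namely that the benchmark itself needs no counter-specific code.

```c++
#include <vector>

#include "benchmark/benchmark.h"

// An ordinary benchmark; nothing counter-specific is required. When the
// binary is built with BENCHMARK_ENABLE_LIBPFM and run as, e.g.,
//   ./mybench --benchmark_perf_counters=CYCLES,INSTRUCTIONS
// the requested counters are reported alongside the timings as user
// counters, averaged per iteration (Counter::kAvgIterations, as set in
// State::PauseTiming above).
static void BM_PushBack(benchmark::State& state) {
  for (auto _ : state) {
    std::vector<int> v;
    v.push_back(42);
    benchmark::DoNotOptimize(v.data());
  }
}
BENCHMARK(BM_PushBack);

BENCHMARK_MAIN();
```

Because the values flow through the existing UserCounters mechanism, they appear in every output format (console, JSON, CSV) with no further changes.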