From 376ebc26354ca2b79af94467133f3c35b539627e Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 28 Apr 2021 01:25:29 -0700
Subject: [PATCH] Support optional, user-directed collection of performance
 counters (#1114)

* Support optional, user-directed collection of performance counters

The patch helps an engineer drill into the root causes of a regression,
for example. Currently, only single-threaded runs are supported.

The feature is a build-time opt-in, and then a runtime opt-in. The
engineer may run the benchmark executable, passing a list of
performance counter names (using libpfm's naming scheme) at the command
line. The counter values will then be collected and reported back as
UserCounters.

This is different from #240 in that it is a benchmark user opt-in, and
the counter collection is transparent to the benchmark. Currently, this
is only supported on platforms where libpfm is available.

libpfm: http://perfmon2.sourceforge.net/

* 'Use' values param in Snapshot when BENCHMARK_OS_WINDOWS

This is to avoid an unused-parameter warning-as-error.

* Added missing include for <unistd.h> in perf_counters.cc

* Moved doc to docs

* Added license blurbs
---
 .../workflows/build-and-test-perfcounters.yml |  44 +++++
 CMakeLists.txt                                |   4 +
 README.md                                     |   2 +
 cmake/Modules/FindPFM.cmake                   |  19 ++
 docs/perf_counters.md                         |  35 ++++
 include/benchmark/benchmark.h                 |   9 +-
 src/CMakeLists.txt                            |   6 +
 src/benchmark.cc                              |  27 ++-
 src/benchmark_api_internal.cc                 |  10 +-
 src/benchmark_api_internal.h                  |   3 +-
 src/benchmark_runner.cc                       |  29 ++-
 src/benchmark_runner.h                        |   2 +
 src/perf_counters.cc                          | 128 +++++++++++++
 src/perf_counters.h                           | 172 ++++++++++++++++++
 src/string_util.cc                            |  12 ++
 src/string_util.h                             |   2 +
 test/CMakeLists.txt                           |   4 +
 test/perf_counters_gtest.cc                   |  95 ++++++++++
 test/perf_counters_test.cc                    |  27 +++
 test/string_util_gtest.cc                     |   8 +
 20 files changed, 621 insertions(+), 17 deletions(-)
 create mode 100644 .github/workflows/build-and-test-perfcounters.yml
 create mode 100644 cmake/Modules/FindPFM.cmake
 create mode 100644 docs/perf_counters.md
 create mode 100644 src/perf_counters.cc
 create mode 100644 src/perf_counters.h
 create mode 100644 test/perf_counters_gtest.cc
 create mode 100644 test/perf_counters_test.cc

diff --git a/.github/workflows/build-and-test-perfcounters.yml b/.github/workflows/build-and-test-perfcounters.yml
new file mode 100644
index 00000000..dfb88cbc
--- /dev/null
+++ b/.github/workflows/build-and-test-perfcounters.yml
@@ -0,0 +1,44 @@
+name: build-and-test-perfcounters
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  job:
+    # TODO(dominic): Extend this to include compiler and set through env: CC/CXX.
+    name: ${{ matrix.os }}.${{ matrix.build_type }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, ubuntu-16.04, ubuntu-20.04]
+        build_type: ['Release', 'Debug']
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: install libpfm
+      run: sudo apt install libpfm4-dev
+
+    - name: create build environment
+      run: cmake -E make_directory ${{ runner.workspace }}/_build
+
+    - name: configure cmake
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake -DBENCHMARK_ENABLE_LIBPFM=1 -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+
+    - name: build
+      shell: bash
+      working-directory: ${{ runner.workspace }}/_build
+      run: cmake --build . --config ${{ matrix.build_type }}
+
+    # Skip testing, for now. It seems perf_event_open does not succeed on the
+    # hosting machine, very likely a permissions issue.
+    # TODO(mtrofin): Enable test.
+    # - name: test
+    #   shell: bash
+    #   working-directory: ${{ runner.workspace }}/_build
+    #   run: sudo ctest -C ${{ matrix.build_type }} --rerun-failed --output-on-failure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10072545..7e0f251f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -270,6 +270,10 @@ cxx_feature_check(STEADY_CLOCK)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
+if (BENCHMARK_ENABLE_LIBPFM)
+  find_package(PFM)
+endif()
+
 # Set up directories
 include_directories(${PROJECT_SOURCE_DIR}/include)
diff --git a/README.md b/README.md
index ceedb334..e8c65f8d 100644
--- a/README.md
+++ b/README.md
@@ -297,6 +297,8 @@ too (`-lkstat`).
 
 [Setting the Time Unit](#setting-the-time-unit)
 
+[User-Requested Performance Counters](docs/perf_counters.md)
+
 [Preventing Optimization](#preventing-optimization)
 
 [Reporting Statistics](#reporting-statistics)
diff --git a/cmake/Modules/FindPFM.cmake b/cmake/Modules/FindPFM.cmake
new file mode 100644
index 00000000..553d458c
--- /dev/null
+++ b/cmake/Modules/FindPFM.cmake
@@ -0,0 +1,19 @@
+# If successful, the following variable will be defined:
+# HAVE_LIBPFM.
+# Set BENCHMARK_ENABLE_LIBPFM to 0 to disable, regardless of libpfm presence.
+include(CheckIncludeFile)
+include(CheckLibraryExists)
+enable_language(C)
+
+check_library_exists(libpfm.a pfm_initialize "" HAVE_LIBPFM_INITIALIZE)
+if(HAVE_LIBPFM_INITIALIZE)
+  check_include_file(perfmon/perf_event.h HAVE_PERFMON_PERF_EVENT_H)
+  check_include_file(perfmon/pfmlib.h HAVE_PERFMON_PFMLIB_H)
+  check_include_file(perfmon/pfmlib_perf_event.h HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
+  if(HAVE_PERFMON_PERF_EVENT_H AND HAVE_PERFMON_PFMLIB_H AND HAVE_PERFMON_PFMLIB_PERF_EVENT_H)
+    message("Using Perf Counters.")
+    set(HAVE_LIBPFM 1)
+  endif()
+else()
+  message("Perf Counters support requested, but was unable to find libpfm.")
+endif()
diff --git a/docs/perf_counters.md b/docs/perf_counters.md
new file mode 100644
index 00000000..43ff4517
--- /dev/null
+++ b/docs/perf_counters.md
@@ -0,0 +1,35 @@
+
+
+# User-Requested Performance Counters
+
+When running benchmarks, the user may choose to request collection of
+performance counters. This may be useful in investigation scenarios:
+narrowing down the cause of a regression, or verifying that the underlying
+cause of a performance improvement matches expectations.
+
+This feature is available if:
+
+* The benchmark is run on an architecture featuring a Performance Monitoring
+  Unit (PMU),
+* The benchmark is compiled with support for collecting counters. Currently,
+  this requires [libpfm](http://perfmon2.sourceforge.net/), which must be
+  available at build time, and
+* The benchmark is run on a single thread (a current limitation).
+
+The feature does not require modifying benchmark code. Counter collection is
+handled at the same boundaries where timer collection is handled.
+
+To opt in:
+
+* Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`.
+* Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM`.
+
+To use, pass a comma-separated list of counter names through the
+`--benchmark_perf_counters` flag. The names are decoded through libpfm and
+are therefore platform-specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`)
+are mapped by libpfm to the appropriate platform-specific event; see the
+libpfm [documentation](http://perfmon2.sourceforge.net/docs.html) for more
+details.
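+
+For example, assuming a benchmark binary `mybench` built with
+`BENCHMARK_ENABLE_LIBPFM` (the binary name and counter choice here are
+illustrative), running
+`./mybench --benchmark_perf_counters=CYCLES,INSTRUCTIONS` collects those two
+counters for every benchmark in the binary.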
+ +The counter values are reported back through the [User Counters](../README.md#custom-counters) +mechanism, meaning, they are available in all the formats (e.g. JSON) supported +by User Counters. \ No newline at end of file diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 881ce9e5..664422b3 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -448,6 +448,7 @@ struct Statistics { struct BenchmarkInstance; class ThreadTimer; class ThreadManager; +class PerfCountersMeasurement; enum AggregationReportMode #if defined(BENCHMARK_HAS_CXX11) @@ -687,15 +688,17 @@ class State { private: State(IterationCount max_iters, const std::vector& ranges, int thread_i, int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager); + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement); void StartKeepRunning(); // Implementation of KeepRunning() and KeepRunningBatch(). // is_batch must be true unless n is 1. bool KeepRunningInternal(IterationCount n, bool is_batch); void FinishKeepRunning(); - internal::ThreadTimer* timer_; - internal::ThreadManager* manager_; + internal::ThreadTimer* const timer_; + internal::ThreadManager* const manager_; + internal::PerfCountersMeasurement* const perf_counters_measurement_; friend struct internal::BenchmarkInstance; }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 35d559ee..a6c8e9a7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -28,6 +28,12 @@ target_include_directories(benchmark PUBLIC $ ) +# libpfm, if available +if (HAVE_LIBPFM) + target_link_libraries(benchmark libpfm.a) + add_definitions(-DHAVE_LIBPFM) +endif() + # Link threads. target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) find_library(LIBRT rt) diff --git a/src/benchmark.cc b/src/benchmark.cc index ffe4bf45..1fea654c 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -45,6 +45,7 @@ #include "internal_macros.h" #include "log.h" #include "mutex.h" +#include "perf_counters.h" #include "re.h" #include "statistics.h" #include "string_util.h" @@ -106,6 +107,10 @@ DEFINE_bool(benchmark_counters_tabular, false); // The level of verbose logging to output DEFINE_int32(v, 0); +// List of additional perf counters to collect, in libpfm format. 
For more +// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html +DEFINE_string(benchmark_perf_counters, ""); + namespace benchmark { namespace internal { @@ -117,7 +122,8 @@ void UseCharPointer(char const volatile*) {} State::State(IterationCount max_iters, const std::vector& ranges, int thread_i, int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager) + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) : total_iterations_(0), batch_leftover_(0), max_iterations(max_iters), @@ -130,7 +136,8 @@ State::State(IterationCount max_iters, const std::vector& ranges, thread_index(thread_i), threads(n_threads), timer_(timer), - manager_(manager) { + manager_(manager), + perf_counters_measurement_(perf_counters_measurement) { CHECK(max_iterations != 0) << "At least one iteration must be run"; CHECK_LT(thread_index, threads) << "thread_index must be less than threads"; @@ -163,11 +170,23 @@ void State::PauseTiming() { // Add in time accumulated so far CHECK(started_ && !finished_ && !error_occurred_); timer_->StopTimer(); + if (perf_counters_measurement_) { + auto measurements = perf_counters_measurement_->StopAndGetMeasurements(); + for (const auto& name_and_measurement : measurements) { + auto name = name_and_measurement.first; + auto measurement = name_and_measurement.second; + CHECK_EQ(counters[name], 0.0); + counters[name] = Counter(measurement, Counter::kAvgIterations); + } + } } void State::ResumeTiming() { CHECK(started_ && !finished_ && !error_occurred_); timer_->StartTimer(); + if (perf_counters_measurement_) { + perf_counters_measurement_->Start(); + } } void State::SkipWithError(const char* msg) { @@ -457,7 +476,9 @@ void ParseCommandLineFlags(int* argc, char** argv) { ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) || ParseBoolFlag(argv[i], "benchmark_counters_tabular", &FLAGS_benchmark_counters_tabular) || - ParseInt32Flag(argv[i], "v", &FLAGS_v)) { + ParseInt32Flag(argv[i], "v", &FLAGS_v) || + ParseStringFlag(argv[i], "benchmark_perf_counters", + &FLAGS_benchmark_perf_counters)) { for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1]; --(*argc); diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc index d468a257..804ef894 100644 --- a/src/benchmark_api_internal.cc +++ b/src/benchmark_api_internal.cc @@ -3,10 +3,12 @@ namespace benchmark { namespace internal { -State BenchmarkInstance::Run(IterationCount iters, int thread_id, - internal::ThreadTimer* timer, - internal::ThreadManager* manager) const { - State st(iters, arg, thread_id, threads, timer, manager); +State BenchmarkInstance::Run( + IterationCount iters, int thread_id, internal::ThreadTimer* timer, + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) const { + State st(iters, arg, thread_id, threads, timer, manager, + perf_counters_measurement); benchmark->Run(st); return st; } diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h index 264eff95..b740bce1 100644 --- a/src/benchmark_api_internal.h +++ b/src/benchmark_api_internal.h @@ -36,7 +36,8 @@ struct BenchmarkInstance { int threads; // Number of concurrent threads to us State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, - internal::ThreadManager* manager) const; + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) const; }; bool FindBenchmarksInternal(const std::string& re, diff --git 
a/src/benchmark_runner.cc b/src/benchmark_runner.cc index d081aa86..083d1849 100644 --- a/src/benchmark_runner.cc +++ b/src/benchmark_runner.cc @@ -45,6 +45,7 @@ #include "internal_macros.h" #include "log.h" #include "mutex.h" +#include "perf_counters.h" #include "re.h" #include "statistics.h" #include "string_util.h" @@ -111,12 +112,14 @@ BenchmarkReporter::Run CreateRunReport( // Execute one thread of benchmark b for the specified number of iterations. // Adds the stats collected for the thread into manager->results. void RunInThread(const BenchmarkInstance* b, IterationCount iters, - int thread_id, ThreadManager* manager) { + int thread_id, ThreadManager* manager, + PerfCountersMeasurement* perf_counters_measurement) { internal::ThreadTimer timer( b->measure_process_cpu_time ? internal::ThreadTimer::CreateProcessCpuTime() : internal::ThreadTimer::Create()); - State st = b->Run(iters, thread_id, &timer, manager); + State st = + b->Run(iters, thread_id, &timer, manager, perf_counters_measurement); CHECK(st.error_occurred() || st.iterations() >= st.max_iterations) << "Benchmark returned before State::KeepRunning() returned false!"; { @@ -143,7 +146,12 @@ class BenchmarkRunner { : FLAGS_benchmark_repetitions), has_explicit_iteration_count(b.iterations != 0), pool(b.threads - 1), - iters(has_explicit_iteration_count ? b.iterations : 1) { + iters(has_explicit_iteration_count ? b.iterations : 1), + perf_counters_measurement( + PerfCounters::Create(StrSplit(FLAGS_benchmark_perf_counters, ','))), + perf_counters_measurement_ptr(perf_counters_measurement.IsValid() + ? &perf_counters_measurement + : nullptr) { run_results.display_report_aggregates_only = (FLAGS_benchmark_report_aggregates_only || FLAGS_benchmark_display_aggregates_only); @@ -155,6 +163,11 @@ class BenchmarkRunner { internal::ARM_DisplayReportAggregatesOnly); run_results.file_report_aggregates_only = (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly); + CHECK(b.threads == 1 || !perf_counters_measurement.IsValid()) + << "Perf counters are not supported in multi-threaded cases.\n"; + CHECK(FLAGS_benchmark_perf_counters.empty() || + perf_counters_measurement.IsValid()) + << "Perf counters were requested but could not be set up."; } for (int repetition_num = 0; repetition_num < repeats; repetition_num++) { @@ -192,6 +205,9 @@ class BenchmarkRunner { // So only the first repetition has to find/calculate it, // the other repetitions will just use that precomputed iteration count. + PerfCountersMeasurement perf_counters_measurement; + PerfCountersMeasurement* const perf_counters_measurement_ptr; + struct IterationResults { internal::ThreadManager::Result results; IterationCount iters; @@ -206,12 +222,12 @@ class BenchmarkRunner { // Run all but one thread in separate threads for (std::size_t ti = 0; ti < pool.size(); ++ti) { pool[ti] = std::thread(&RunInThread, &b, iters, static_cast(ti + 1), - manager.get()); + manager.get(), perf_counters_measurement_ptr); } // And run one thread here directly. // (If we were asked to run just one thread, we don't create new threads.) // Yes, we need to do this here *after* we start the separate threads. - RunInThread(&b, iters, 0, manager.get()); + RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr); // The main thread has finished. Now let's wait for the other threads. 
manager->WaitForAllThreads(); @@ -331,7 +347,8 @@ class BenchmarkRunner { memory_manager->Start(); std::unique_ptr manager; manager.reset(new internal::ThreadManager(1)); - RunInThread(&b, memory_iterations, 0, manager.get()); + RunInThread(&b, memory_iterations, 0, manager.get(), + perf_counters_measurement_ptr); manager->WaitForAllThreads(); manager.reset(); diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h index 96e8282a..9b0cf2a6 100644 --- a/src/benchmark_runner.h +++ b/src/benchmark_runner.h @@ -26,6 +26,8 @@ DECLARE_bool(benchmark_report_aggregates_only); DECLARE_bool(benchmark_display_aggregates_only); +DECLARE_string(benchmark_perf_counters); + namespace benchmark { namespace internal { diff --git a/src/perf_counters.cc b/src/perf_counters.cc new file mode 100644 index 00000000..eb28cd99 --- /dev/null +++ b/src/perf_counters.cc @@ -0,0 +1,128 @@ +// Copyright 2021 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "perf_counters.h" + +#include +#include + +#if defined HAVE_LIBPFM +#include "perfmon/pfmlib.h" +#include "perfmon/pfmlib_perf_event.h" +#endif + +namespace benchmark { +namespace internal { + +constexpr size_t PerfCounterValues::kMaxCounters; + +#if defined HAVE_LIBPFM +const bool PerfCounters::kSupported = true; + +bool PerfCounters::Initialize() { return pfm_initialize() == PFM_SUCCESS; } + +PerfCounters PerfCounters::Create( + const std::vector& counter_names) { + if (counter_names.empty()) { + return NoCounters(); + } + if (counter_names.size() > PerfCounterValues::kMaxCounters) { + GetErrorLogInstance() + << counter_names.size() + << " counters were requested. The minimum is 1, the maximum is " + << PerfCounterValues::kMaxCounters << "\n"; + return NoCounters(); + } + std::vector counter_ids(counter_names.size()); + + const int mode = PFM_PLM3; // user mode only + for (size_t i = 0; i < counter_names.size(); ++i) { + const bool is_first = i == 0; + struct perf_event_attr attr{}; + attr.size = sizeof(attr); + const int group_id = !is_first ? counter_ids[0] : -1; + const auto& name = counter_names[i]; + if (name.empty()) { + GetErrorLogInstance() << "A counter name was the empty string\n"; + return NoCounters(); + } + pfm_perf_encode_arg_t arg{}; + arg.attr = &attr; + + const int pfm_get = + pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg); + if (pfm_get != PFM_SUCCESS) { + GetErrorLogInstance() << "Unknown counter name: " << name << "\n"; + return NoCounters(); + } + attr.disabled = is_first; + attr.pinned = is_first; + attr.exclude_kernel = true; + attr.exclude_user = false; + attr.exclude_hv = true; + // Read all counters in one read. + attr.read_format = PERF_FORMAT_GROUP; + + int id = -1; + static constexpr size_t kNrOfSyscallRetries = 5; + // Retry syscall as it was interrupted often (b/64774091). 
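+    // EINTR from perf_event_open means the call was interrupted by a signal
+    // before a file descriptor was allocated, so no resource was created and
+    // retrying a bounded number of times is safe.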
+ for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries; + ++num_retries) { + id = perf_event_open(&attr, 0, -1, group_id, 0); + if (id >= 0 || errno != EINTR) { + break; + } + } + if (id < 0) { + GetErrorLogInstance() + << "Failed to get a file descriptor for " << name << "\n"; + return NoCounters(); + } + + counter_ids[i] = id; + } + if (ioctl(counter_ids[0], PERF_EVENT_IOC_ENABLE) != 0) { + GetErrorLogInstance() << "Failed to start counters\n"; + return NoCounters(); + } + + return PerfCounters(counter_names, std::move(counter_ids)); +} + +PerfCounters::~PerfCounters() { + if (counter_ids_.empty()) { + return; + } + ioctl(counter_ids_[0], PERF_EVENT_IOC_DISABLE); + for (int fd : counter_ids_) { + close(fd); + } +} +#else // defined HAVE_LIBPFM +const bool PerfCounters::kSupported = false; + +bool PerfCounters::Initialize() { return false; } + +PerfCounters PerfCounters::Create( + const std::vector& counter_names) { + if (!counter_names.empty()) { + GetErrorLogInstance() << "Performance counters not supported."; + } + return NoCounters(); +} + +PerfCounters::~PerfCounters() = default; +#endif // defined HAVE_LIBPFM +} // namespace internal +} // namespace benchmark diff --git a/src/perf_counters.h b/src/perf_counters.h new file mode 100644 index 00000000..0c2c4616 --- /dev/null +++ b/src/perf_counters.h @@ -0,0 +1,172 @@ +// Copyright 2021 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef BENCHMARK_PERF_COUNTERS_H +#define BENCHMARK_PERF_COUNTERS_H + +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "check.h" +#include "log.h" + +#ifndef BENCHMARK_OS_WINDOWS +#include +#endif + +namespace benchmark { +namespace internal { + +// Typically, we can only read a small number of counters. There is also a +// padding preceding counter values, when reading multiple counters with one +// syscall (which is desirable). PerfCounterValues abstracts these details. +// The implementation ensures the storage is inlined, and allows 0-based +// indexing into the counter values. +// The object is used in conjunction with a PerfCounters object, by passing it +// to Snapshot(). The values are populated such that +// perfCounters->names()[i]'s value is obtained at position i (as given by +// operator[]) of this object. +class PerfCounterValues { + public: + explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) { + CHECK_LE(nr_counters_, kMaxCounters); + } + + uint64_t operator[](size_t pos) const { return values_[kPadding + pos]; } + + static constexpr size_t kMaxCounters = 3; + + private: + friend class PerfCounters; + // Get the byte buffer in which perf counters can be captured. + // This is used by PerfCounters::Read + std::pair get_data_buffer() { + return {reinterpret_cast(values_.data()), + sizeof(uint64_t) * (kPadding + nr_counters_)}; + } + + static constexpr size_t kPadding = 1; + std::array values_; + const size_t nr_counters_; +}; + +// Collect PMU counters. 
The object, once constructed, is ready to be used by +// calling read(). PMU counter collection is enabled from the time create() is +// called, to obtain the object, until the object's destructor is called. +class PerfCounters final { + public: + // True iff this platform supports performance counters. + static const bool kSupported; + + bool IsValid() const { return is_valid_; } + static PerfCounters NoCounters() { return PerfCounters(); } + + ~PerfCounters(); + PerfCounters(PerfCounters&&) = default; + PerfCounters(const PerfCounters&) = delete; + + // Platform-specific implementations may choose to do some library + // initialization here. + static bool Initialize(); + + // Return a PerfCounters object ready to read the counters with the names + // specified. The values are user-mode only. The counter name format is + // implementation and OS specific. + // TODO: once we move to C++-17, this should be a std::optional, and then the + // IsValid() boolean can be dropped. + static PerfCounters Create(const std::vector& counter_names); + + // Take a snapshot of the current value of the counters into the provided + // valid PerfCounterValues storage. The values are populated such that: + // names()[i]'s value is (*values)[i] + BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) { +#ifndef BENCHMARK_OS_WINDOWS + assert(values != nullptr); + assert(IsValid()); + auto buffer = values->get_data_buffer(); + auto read_bytes = ::read(counter_ids_[0], buffer.first, buffer.second); + return static_cast(read_bytes) == buffer.second; +#else + (void)values; + return false; +#endif + } + + const std::vector& names() const { return counter_names_; } + size_t num_counters() const { return counter_names_.size(); } + + private: + PerfCounters(const std::vector& counter_names, + std::vector&& counter_ids) + : counter_ids_(std::move(counter_ids)), + counter_names_(counter_names), + is_valid_(true) {} + PerfCounters() : is_valid_(false) {} + + std::vector counter_ids_; + const std::vector counter_names_; + const bool is_valid_; +}; + +// Typical usage of the above primitives. +class PerfCountersMeasurement final { + public: + PerfCountersMeasurement(PerfCounters&& c) + : counters_(std::move(c)), + start_values_(counters_.IsValid() ? counters_.names().size() : 0), + end_values_(counters_.IsValid() ? counters_.names().size() : 0) {} + + bool IsValid() const { return counters_.IsValid(); } + + BENCHMARK_ALWAYS_INLINE void Start() { + assert(IsValid()); + // Tell the compiler to not move instructions above/below where we take + // the snapshot. + ClobberMemory(); + counters_.Snapshot(&start_values_); + ClobberMemory(); + } + + BENCHMARK_ALWAYS_INLINE std::vector> + StopAndGetMeasurements() { + assert(IsValid()); + // Tell the compiler to not move instructions above/below where we take + // the snapshot. 
+    ClobberMemory();
+    counters_.Snapshot(&end_values_);
+    ClobberMemory();
+
+    std::vector<std::pair<std::string, double>> ret;
+    for (size_t i = 0; i < counters_.names().size(); ++i) {
+      double measurement = static_cast<double>(end_values_[i]) -
+                           static_cast<double>(start_values_[i]);
+      ret.push_back({counters_.names()[i], measurement});
+    }
+    return ret;
+  }
+
+ private:
+  PerfCounters counters_;
+  PerfCounterValues start_values_;
+  PerfCounterValues end_values_;
+};
+
+BENCHMARK_UNUSED static bool perf_init_anchor = PerfCounters::Initialize();
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_PERF_COUNTERS_H
diff --git a/src/string_util.cc b/src/string_util.cc
index ac60b558..53b1532b 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -163,6 +163,18 @@ std::string StrFormat(const char* format, ...) {
   return tmp;
 }
 
+std::vector<std::string> StrSplit(const std::string& str, char delim) {
+  std::vector<std::string> ret;
+  size_t first = 0;
+  size_t next = str.find(delim);
+  for (; next != std::string::npos;
+       first = next + 1, next = str.find(delim, first)) {
+    ret.push_back(str.substr(first, next - first));
+  }
+  ret.push_back(str.substr(first));
+  return ret;
+}
+
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
 /*
  * GNU STL in Android NDK lacks support for some C++11 functions, including
diff --git a/src/string_util.h b/src/string_util.h
index 09d7b4bd..6bc28b69 100644
--- a/src/string_util.h
+++ b/src/string_util.h
@@ -37,6 +37,8 @@ inline std::string StrCat(Args&&... args) {
   return ss.str();
 }
 
+std::vector<std::string> StrSplit(const std::string& str, char delim);
+
 #ifdef BENCHMARK_STL_ANDROID_GNUSTL
 /*
  * GNU STL in Android NDK lacks support for some C++11 functions, including
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c1a3a3fc..1e7b6829 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -128,6 +128,9 @@ add_test(NAME templated_fixture_test COMMAND templated_fixture_test --benchmark_
 compile_output_test(user_counters_test)
 add_test(NAME user_counters_test COMMAND user_counters_test --benchmark_min_time=0.01)
 
+compile_output_test(perf_counters_test)
+add_test(NAME perf_counters_test COMMAND perf_counters_test --benchmark_min_time=0.01 --benchmark_perf_counters=CYCLES,BRANCHES)
+
 compile_output_test(internal_threading_test)
 add_test(NAME internal_threading_test COMMAND internal_threading_test --benchmark_min_time=0.01)
 
@@ -196,6 +199,7 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
   add_gtest(commandlineflags_gtest)
   add_gtest(statistics_gtest)
   add_gtest(string_util_gtest)
+  add_gtest(perf_counters_gtest)
 endif(BENCHMARK_ENABLE_GTEST_TESTS)
 
 ###############################################################################
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc
new file mode 100644
index 00000000..47894af4
--- /dev/null
+++ b/test/perf_counters_gtest.cc
@@ -0,0 +1,95 @@
+#include "../src/perf_counters.h"
+#include "gtest/gtest.h"
+
+#ifndef GTEST_SKIP
+struct MsgHandler {
+  void operator=(std::ostream&) {}
+};
+#define GTEST_SKIP() return MsgHandler() = std::cout
+#endif
+
+using benchmark::internal::PerfCounters;
+using benchmark::internal::PerfCounterValues;
+
+namespace {
+const char kGenericPerfEvent1[] = "CYCLES";
+const char kGenericPerfEvent2[] = "BRANCHES";
+const char kGenericPerfEvent3[] = "INSTRUCTIONS";
+
+TEST(PerfCountersTest, Init) {
+  EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
+}
+
+TEST(PerfCountersTest, OneCounter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Performance counters not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1}).IsValid());
+}
+
+TEST(PerfCountersTest, NegativeTest) {
+  if (!PerfCounters::kSupported) {
+    EXPECT_FALSE(PerfCounters::Initialize());
+    return;
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  EXPECT_FALSE(PerfCounters::Create({}).IsValid());
+  EXPECT_FALSE(PerfCounters::Create({""}).IsValid());
+  EXPECT_FALSE(PerfCounters::Create({"not a counter name"}).IsValid());
+  {
+    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                      kGenericPerfEvent3})
+                    .IsValid());
+  }
+  EXPECT_FALSE(
+      PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1})
+          .IsValid());
+  EXPECT_FALSE(PerfCounters::Create({kGenericPerfEvent3, "not a counter name",
+                                     kGenericPerfEvent1})
+                   .IsValid());
+  {
+    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                                      kGenericPerfEvent3})
+                    .IsValid());
+  }
+  EXPECT_FALSE(
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
+                            kGenericPerfEvent3, "MISPREDICTED_BRANCH_RETIRED"})
+          .IsValid());
+}
+
+TEST(PerfCountersTest, Read1Counter) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters = PerfCounters::Create({kGenericPerfEvent1});
+  EXPECT_TRUE(counters.IsValid());
+  PerfCounterValues values1(1);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  PerfCounterValues values2(1);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[0], values1[0]);
+}
+
+TEST(PerfCountersTest, Read2Counters) {
+  if (!PerfCounters::kSupported) {
+    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
+  }
+  EXPECT_TRUE(PerfCounters::Initialize());
+  auto counters =
+      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
+  EXPECT_TRUE(counters.IsValid());
+  PerfCounterValues values1(2);
+  EXPECT_TRUE(counters.Snapshot(&values1));
+  EXPECT_GT(values1[0], 0);
+  EXPECT_GT(values1[1], 0);
+  PerfCounterValues values2(2);
+  EXPECT_TRUE(counters.Snapshot(&values2));
+  EXPECT_GT(values2[0], 0);
+  EXPECT_GT(values2[1], 0);
+}
+}  // namespace
diff --git a/test/perf_counters_test.cc b/test/perf_counters_test.cc
new file mode 100644
index 00000000..d6e0284d
--- /dev/null
+++ b/test/perf_counters_test.cc
@@ -0,0 +1,27 @@
+#undef NDEBUG
+
+#include "../src/perf_counters.h"
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+void BM_Simple(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_Simple);
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Simple\",$"}});
+
+void CheckSimple(Results const& e) {
+  CHECK_COUNTER_VALUE(e, double, "CYCLES", GT, 0);
+  CHECK_COUNTER_VALUE(e, double, "BRANCHES", GT, 0.0);
+}
+CHECK_BENCHMARK_RESULTS("BM_Simple", &CheckSimple);
+
+int main(int argc, char* argv[]) {
+  if (!benchmark::internal::PerfCounters::kSupported) {
+    return 0;
+  }
+  RunOutputTests(argc, argv);
+}
diff --git a/test/string_util_gtest.cc b/test/string_util_gtest.cc
index 01bf155d..77a719a6 100644
--- a/test/string_util_gtest.cc
+++ b/test/string_util_gtest.cc
@@ -150,4 +150,12 @@ TEST(StringUtilTest, stod) {
 #endif
 }
 
+TEST(StringUtilTest, StrSplit) {
+  EXPECT_EQ(benchmark::StrSplit("", ','), std::vector<std::string>{""});
+  EXPECT_EQ(benchmark::StrSplit("hello", ','),
+            std::vector<std::string>({"hello"}));
+  EXPECT_EQ(benchmark::StrSplit("hello,there", ','),
+            std::vector<std::string>({"hello", "there"}));
+}
+
 } // end namespace
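
For context, a minimal end-to-end sketch follows. It is not part of the patch, and the benchmark body and names are illustrative; it shows the key property of the feature, namely that the benchmark itself needs no counter-specific code.

```c++
#include <vector>

#include "benchmark/benchmark.h"

// An ordinary benchmark; nothing counter-specific is required. When the
// binary is built with BENCHMARK_ENABLE_LIBPFM and run as, e.g.,
//   ./mybench --benchmark_perf_counters=CYCLES,INSTRUCTIONS
// the requested counters are reported alongside the timings as user
// counters, averaged per iteration (Counter::kAvgIterations, as set in
// State::PauseTiming above).
static void BM_PushBack(benchmark::State& state) {
  for (auto _ : state) {
    std::vector<int> v;
    v.push_back(42);
    benchmark::DoNotOptimize(v.data());
  }
}
BENCHMARK(BM_PushBack);

BENCHMARK_MAIN();
```

Because the values flow through the existing UserCounters mechanism, they appear in every output format (console, JSON, CSV) with no further changes.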