diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index c154a157..ad7c92e9 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -218,6 +218,18 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #define BENCHMARK_UNUSED
 #endif
 
+// Used to annotate functions, methods and classes so they
+// are not optimized by the compiler. Useful for tests
+// where you expect loops to stay in place, churning cycles.
+#if defined(__clang__)
+#define BENCHMARK_DONT_OPTIMIZE __attribute__((optnone))
+#elif defined(__GNUC__) || defined(__GNUG__)
+#define BENCHMARK_DONT_OPTIMIZE __attribute__((optimize(0)))
+#else
+// MSVC & Intel do not have a no-optimize attribute, only line pragmas.
+#define BENCHMARK_DONT_OPTIMIZE
+#endif
+
 #if defined(__GNUC__) || defined(__clang__)
 #define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
 #elif defined(_MSC_VER) && !defined(__clang__)
diff --git a/src/benchmark.cc b/src/benchmark.cc
index e2d85fe4..b8eda008 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -348,14 +348,26 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
   size_t num_repetitions_total = 0;
 
+  // This perfcounters object needs to be created before the runners vector
+  // below so that it outlasts their lifetime.
+  PerfCountersMeasurement perfcounters(
+      StrSplit(FLAGS_benchmark_perf_counters, ','));
+
+  // Vector of benchmark runners.
   std::vector<internal::BenchmarkRunner> runners;
   runners.reserve(benchmarks.size());
+
+  // Count the number of multi-threaded benchmarks so we can warn the user
+  // in case performance counters are used.
+  int benchmarks_with_threads = 0;
+
+  // Loop through all benchmarks.
   for (const BenchmarkInstance& benchmark : benchmarks) {
     BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
     if (benchmark.complexity() != oNone)
       reports_for_family = &per_family_reports[benchmark.family_index()];
-
-    runners.emplace_back(benchmark, reports_for_family);
+    benchmarks_with_threads += (benchmark.threads() > 1);
+    runners.emplace_back(benchmark, &perfcounters, reports_for_family);
     int num_repeats_of_this_instance = runners.back().GetNumRepeats();
     num_repetitions_total += num_repeats_of_this_instance;
     if (reports_for_family)
@@ -363,6 +375,17 @@
   }
   assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
 
+  // Using performance counters with threads would be unintuitive for
+  // the average user, so we warn about this case here.
+  if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
+    GetErrorLogInstance()
+        << "***WARNING*** There are " << benchmarks_with_threads
+        << " benchmarks with threads and " << perfcounters.num_counters()
+        << " performance counters were requested. Beware: counters will "
+           "reflect the combined usage across all "
+           "threads.\n";
+  }
+
   std::vector<size_t> repetition_indices;
   repetition_indices.reserve(num_repetitions_total);
   for (size_t runner_index = 0, num_runners = runners.size();
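Note for context: a minimal sketch of when the new warning fires (this benchmark is illustrative, not part of the patch). Because the events are opened with attr.inherit = true (see src/perf_counters.cc below), counter values aggregate over every thread of the process, so per-thread attribution is not available:

    // Illustrative only. Built against <benchmark/benchmark.h>; run with
    // --benchmark_perf_counters=CYCLES to trigger the warning above.
    #include <benchmark/benchmark.h>

    static void BM_Spin(benchmark::State& state) {
      for (auto _ : state) benchmark::DoNotOptimize(state.iterations());
    }
    // threads() > 1, so this instance counts toward benchmarks_with_threads;
    // the reported counter deltas will cover all four threads combined.
    BENCHMARK(BM_Spin)->Threads(4);
    BENCHMARK_MAIN();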
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index 09975a9e..58147ca7 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -221,6 +221,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value) {
 
 BenchmarkRunner::BenchmarkRunner(
     const benchmark::internal::BenchmarkInstance& b_,
+    PerfCountersMeasurement* pcm_,
     BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
     : b(b_),
       reports_for_family(reports_for_family_),
@@ -239,10 +240,7 @@ BenchmarkRunner::BenchmarkRunner(
       iters(has_explicit_iteration_count
                 ? ComputeIters(b_, parsed_benchtime_flag)
                 : 1),
-      perf_counters_measurement(StrSplit(FLAGS_benchmark_perf_counters, ',')),
-      perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
-                                        ? &perf_counters_measurement
-                                        : nullptr) {
+      perf_counters_measurement_ptr(pcm_) {
   run_results.display_report_aggregates_only =
       (FLAGS_benchmark_report_aggregates_only ||
        FLAGS_benchmark_display_aggregates_only);
@@ -255,7 +253,7 @@ BenchmarkRunner::BenchmarkRunner(
     run_results.file_report_aggregates_only =
         (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
     BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
-             perf_counters_measurement.IsValid())
+             (perf_counters_measurement_ptr->num_counters() > 0))
         << "Perf counters were requested but could not be set up.";
   }
 }
diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h
index 9d806537..db2fa043 100644
--- a/src/benchmark_runner.h
+++ b/src/benchmark_runner.h
@@ -58,6 +58,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value);
 class BenchmarkRunner {
  public:
   BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+                  benchmark::internal::PerfCountersMeasurement* pcm_,
                   BenchmarkReporter::PerFamilyRunReports* reports_for_family);
 
   int GetNumRepeats() const { return repeats; }
@@ -103,8 +104,7 @@ class BenchmarkRunner {
   // So only the first repetition has to find/calculate it,
   // the other repetitions will just use that precomputed iteration count.
 
-  PerfCountersMeasurement perf_counters_measurement;
-  PerfCountersMeasurement* const perf_counters_measurement_ptr;
+  PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
 
   struct IterationResults {
     internal::ThreadManager::Result results;
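Condensed view of the new ownership model (taken from the hunks above, names unchanged): RunBenchmarks owns the single PerfCountersMeasurement, and every runner merely borrows it, which is what allows the static ref-count, mutex and shared PerfCounters to be deleted from the measurement class further below:

    // Created once, up front, in RunBenchmarks()...
    PerfCountersMeasurement perfcounters(
        StrSplit(FLAGS_benchmark_perf_counters, ','));
    // ...and only borrowed by each runner, which stores the raw pointer
    // for the lifetime of the run.
    runners.emplace_back(benchmark, &perfcounters, reports_for_family);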
diff --git a/src/perf_counters.cc b/src/perf_counters.cc
index 38b73ae4..3980ea05 100644
--- a/src/perf_counters.cc
+++ b/src/perf_counters.cc
@@ -71,80 +71,78 @@ bool PerfCounters::IsCounterSupported(const std::string& name) {
   return (ret == PFM_SUCCESS);
 }
 
-// Validates all counter names passed, returning only the valid ones
-static std::vector<std::string> validateCounters(
-    const std::vector<std::string>& counter_names) {
-  // All valid names to be returned
-  std::vector<std::string> valid_names;
-
-  // Loop through all the given names
-  int invalid_counter = 0;
-  for (const std::string& name : counter_names) {
-    // Check trivial empty
-    if (name.empty()) {
-      GetErrorLogInstance() << "A counter name was the empty string\n";
-      invalid_counter++;
-      continue;
-    }
-    if (PerfCounters::IsCounterSupported(name)) {
-      // we are about to push into the valid names vector
-      // check if we did not reach the maximum
-      if (valid_names.size() == PerfCounterValues::kMaxCounters) {
-        GetErrorLogInstance()
-            << counter_names.size()
-            << " counters were requested. The maximum is "
-            << PerfCounterValues::kMaxCounters << " and "
-            << counter_names.size() - invalid_counter - valid_names.size()
-            << " will be ignored\n";
-        // stop the loop and return what we have already
-        break;
-      }
-      valid_names.push_back(name);
-    } else {
-      GetErrorLogInstance() << "Performance counter " << name
-                            << " incorrect or not supported on this platform\n";
-      invalid_counter++;
-    }
-  }
-  // RVO should take care of this
-  return valid_names;
-}
-
 PerfCounters PerfCounters::Create(
     const std::vector<std::string>& counter_names) {
-  std::vector<std::string> valid_names = validateCounters(counter_names);
-  if (valid_names.empty()) {
-    return NoCounters();
-  }
-  std::vector<int> counter_ids(valid_names.size());
+  // Valid counters will populate these arrays, but we start empty.
+  std::vector<std::string> valid_names;
+  std::vector<int> counter_ids;
   std::vector<int> leader_ids;
 
-  const int mode = PFM_PLM3;  // user mode only
+  // Reserve the maximum possible size up front.
+  valid_names.reserve(counter_names.size());
+  counter_ids.reserve(counter_names.size());
+
+  const int kCounterMode = PFM_PLM3;  // user mode only
+
+  // Group leaders will be assigned on demand. The idea is that once we
+  // cannot create a counter descriptor, the likely reason is that the
+  // current group has maxed out, so we reset group_id to -1 and retry,
+  // giving the algorithm a chance to create a new group leader to hold
+  // the next set of counters.
   int group_id = -1;
-  for (size_t i = 0; i < valid_names.size(); ++i) {
+
+  // Loop through all performance counters.
+  for (size_t i = 0; i < counter_names.size(); ++i) {
+    // We are about to push into the valid names vector;
+    // check that we have not reached the maximum yet.
+    if (valid_names.size() == PerfCounterValues::kMaxCounters) {
+      // Log a message if we maxed out and stop adding.
+      GetErrorLogInstance()
+          << counter_names.size() << " counters were requested. The maximum is "
+          << PerfCounterValues::kMaxCounters << " and " << valid_names.size()
+          << " were already added. All remaining counters will be ignored\n";
+      // Stop the loop and return what we have already.
+      break;
+    }
+
+    // Check if this name is empty.
+    const auto& name = counter_names[i];
+    if (name.empty()) {
+      GetErrorLogInstance()
+          << "A performance counter name was the empty string\n";
+      continue;
+    }
+
+    // Here "first" means first in its group, i.e. the group leader.
    const bool is_first = (group_id < 0);
+
+    // This struct will be populated by libpfm from the counter string
+    // and then fed into the perf_event_open syscall.
     struct perf_event_attr attr {};
     attr.size = sizeof(attr);
-    const auto& name = valid_names[i];
+
+    // This is the input struct to libpfm.
     pfm_perf_encode_arg_t arg{};
     arg.attr = &attr;
-
-    const int pfm_get =
-        pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
+    const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
+                                                  PFM_OS_PERF_EVENT, &arg);
     if (pfm_get != PFM_SUCCESS) {
-      GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
-      return NoCounters();
+      GetErrorLogInstance()
+          << "Unknown performance counter name: " << name << "\n";
+      continue;
     }
-    attr.disabled = is_first;
+
+    // We then proceed to populate the remaining fields in our attribute
+    // struct.
     // Note: the man page for perf_event_create suggests inherit = true and
     // read_format = PERF_FORMAT_GROUP don't work together, but that's not the
     // case.
+    attr.disabled = is_first;
     attr.inherit = true;
     attr.pinned = is_first;
     attr.exclude_kernel = true;
     attr.exclude_user = false;
     attr.exclude_hv = true;
-    // Read all counters in one read.
+
+    // Read all counters in a group in one read.
     attr.read_format = PERF_FORMAT_GROUP;
 
     int id = -1;
@@ -159,36 +157,64 @@ PerfCounters PerfCounters::Create(
        }
      }
      if (id < 0) {
-        // We reached a limit perhaps?
+        // If the file descriptor is negative, we might have reached a limit
+        // in the current group. Set the group_id to -1 and retry.
        if (group_id >= 0) {
          // Create a new group
          group_id = -1;
        } else {
-          // Give up, there is nothing else to try
+          // We have already retried with a new group id and failed again,
+          // so we give up.
          break;
        }
      }
    }
+
+    // We failed to get a new file descriptor. We might have reached a hard
+    // hardware limit that cannot be resolved even with group multiplexing.
    if (id < 0) {
-      GetErrorLogInstance()
-          << "Failed to get a file descriptor for " << name << "\n";
-      return NoCounters();
+      GetErrorLogInstance() << "***WARNING*** Failed to get a file descriptor "
+                               "for performance counter "
+                            << name << ". Ignoring\n";
+
+      // We give up on this counter but try to keep going,
+      // as the others may be fine.
+      continue;
    }
    if (group_id < 0) {
-      // This is a leader, store and assign it
+      // This is a leader: store it and make it the current group.
      leader_ids.push_back(id);
      group_id = id;
    }
-    counter_ids[i] = id;
+    // This is a valid counter; add it to our descriptor's list.
+    counter_ids.push_back(id);
+    valid_names.push_back(name);
  }
+
+  // Loop through all group leaders, activating them.
+  // There is another option of starting ALL counters in a process, but
+  // that would be too far-reaching an intrusion: if the user is using PMCs
+  // themselves, it would have a side effect on them. It is friendlier to
+  // loop through all groups individually.
  for (int lead : leader_ids) {
    if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
-      GetErrorLogInstance() << "Failed to start counters\n";
+      // This should never happen, but if it does, we give up on the
+      // entire batch, as recovery would be a mess.
+      GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
+                               "Clearing out all counters.\n";
" + "Claring out all counters.\n"; + + // Close all peformance counters + for (int id : counter_ids) { + ::close(id); + } + + // Return an empty object so our internal state is still good and + // the process can continue normally without impact return NoCounters(); } } - return PerfCounters(valid_names, std::move(counter_ids), + return PerfCounters(std::move(valid_names), std::move(counter_ids), std::move(leader_ids)); } @@ -223,34 +249,10 @@ PerfCounters PerfCounters::Create( void PerfCounters::CloseCounters() const {} #endif // defined HAVE_LIBPFM -Mutex PerfCountersMeasurement::mutex_; -int PerfCountersMeasurement::ref_count_ = 0; -PerfCounters PerfCountersMeasurement::counters_ = PerfCounters::NoCounters(); - -// The validation in PerfCounter::Create will create less counters than passed -// so it should be okay to initialize start_values_ and end_values_ with the -// upper bound as passed PerfCountersMeasurement::PerfCountersMeasurement( const std::vector& counter_names) : start_values_(counter_names.size()), end_values_(counter_names.size()) { - MutexLock l(mutex_); - if (ref_count_ == 0) { - counters_ = PerfCounters::Create(counter_names); - } - // We chose to increment it even if `counters_` ends up invalid, - // so that we don't keep trying to create, and also since the dtor - // will decrement regardless of `counters_`'s validity - ++ref_count_; - - BM_CHECK(!counters_.IsValid() || counters_.names() == counter_names); -} - -PerfCountersMeasurement::~PerfCountersMeasurement() { - MutexLock l(mutex_); - --ref_count_; - if (ref_count_ == 0) { - counters_ = PerfCounters::NoCounters(); - } + counters_ = PerfCounters::Create(counter_names); } PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept { diff --git a/src/perf_counters.h b/src/perf_counters.h index aeea350d..152a6f25 100644 --- a/src/perf_counters.h +++ b/src/perf_counters.h @@ -90,10 +90,11 @@ class BENCHMARK_EXPORT PerfCounters final { // True iff this platform supports performance counters. static const bool kSupported; - bool IsValid() const { return !counter_names_.empty(); } + // Returns an empty object static PerfCounters NoCounters() { return PerfCounters(); } ~PerfCounters() { CloseCounters(); } + PerfCounters() = default; PerfCounters(PerfCounters&&) = default; PerfCounters(const PerfCounters&) = delete; PerfCounters& operator=(PerfCounters&&) noexcept; @@ -110,8 +111,8 @@ class BENCHMARK_EXPORT PerfCounters final { // Return a PerfCounters object ready to read the counters with the names // specified. The values are user-mode only. The counter name format is // implementation and OS specific. - // TODO: once we move to C++-17, this should be a std::optional, and then the - // IsValid() boolean can be dropped. + // In case of failure, this method will in the worst case return an + // empty object whose state will still be valid. 
diff --git a/src/perf_counters.h b/src/perf_counters.h
index aeea350d..152a6f25 100644
--- a/src/perf_counters.h
+++ b/src/perf_counters.h
@@ -90,10 +90,11 @@ class BENCHMARK_EXPORT PerfCounters final {
   // True iff this platform supports performance counters.
   static const bool kSupported;
 
-  bool IsValid() const { return !counter_names_.empty(); }
+  // Returns an empty object
   static PerfCounters NoCounters() { return PerfCounters(); }
 
   ~PerfCounters() { CloseCounters(); }
+  PerfCounters() = default;
   PerfCounters(PerfCounters&&) = default;
   PerfCounters(const PerfCounters&) = delete;
   PerfCounters& operator=(PerfCounters&&) noexcept;
@@ -110,8 +111,8 @@ class BENCHMARK_EXPORT PerfCounters final {
   // Return a PerfCounters object ready to read the counters with the names
   // specified. The values are user-mode only. The counter name format is
   // implementation and OS specific.
-  // TODO: once we move to C++-17, this should be a std::optional, and then the
-  // IsValid() boolean can be dropped.
+  // In case of failure, this method will, in the worst case, return an
+  // empty object whose state will still be valid.
   static PerfCounters Create(const std::vector<std::string>& counter_names);
 
   // Take a snapshot of the current value of the counters into the provided
@@ -120,7 +121,6 @@ class BENCHMARK_EXPORT PerfCounters final {
   BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
 #ifndef BENCHMARK_OS_WINDOWS
     assert(values != nullptr);
-    assert(IsValid());
     return values->Read(leader_ids_) == counter_ids_.size();
 #else
     (void)values;
@@ -137,7 +137,6 @@ class BENCHMARK_EXPORT PerfCounters final {
       : counter_ids_(std::move(counter_ids)),
         leader_ids_(std::move(leader_ids)),
         counter_names_(counter_names) {}
-  PerfCounters() = default;
 
   void CloseCounters() const;
 
@@ -150,33 +149,25 @@
 class BENCHMARK_EXPORT PerfCountersMeasurement final {
  public:
   PerfCountersMeasurement(const std::vector<std::string>& counter_names);
-  ~PerfCountersMeasurement();
 
-  // The only way to get to `counters_` is after ctor-ing a
-  // `PerfCountersMeasurement`, which means that `counters_`'s state is, here,
-  // decided (either invalid or valid) and won't change again even if a ctor is
-  // concurrently running with this. This is preferring efficiency to
-  // maintainability, because the address of the static can be known at compile
-  // time.
-  bool IsValid() const {
-    MutexLock l(mutex_);
-    return counters_.IsValid();
-  }
+  size_t num_counters() const { return counters_.num_counters(); }
 
-  BENCHMARK_ALWAYS_INLINE void Start() {
-    assert(IsValid());
-    MutexLock l(mutex_);
+  std::vector<std::string> names() const { return counters_.names(); }
+
+  BENCHMARK_ALWAYS_INLINE bool Start() {
+    if (num_counters() == 0) return true;
     // Tell the compiler to not move instructions above/below where we take
     // the snapshot.
     ClobberMemory();
     valid_read_ &= counters_.Snapshot(&start_values_);
     ClobberMemory();
+
+    return valid_read_;
   }
 
   BENCHMARK_ALWAYS_INLINE bool Stop(
       std::vector<std::pair<std::string, double>>& measurements) {
-    assert(IsValid());
-    MutexLock l(mutex_);
+    if (num_counters() == 0) return true;
     // Tell the compiler to not move instructions above/below where we take
     // the snapshot.
     ClobberMemory();
@@ -193,9 +184,7 @@ class BENCHMARK_EXPORT PerfCountersMeasurement final {
   }
 
  private:
-  static Mutex mutex_;
-  GUARDED_BY(mutex_) static int ref_count_;
-  GUARDED_BY(mutex_) static PerfCounters counters_;
+  PerfCounters counters_;
   bool valid_read_ = true;
   PerfCounterValues start_values_;
   PerfCounterValues end_values_;
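With IsValid() gone, a measurement over zero counters is a well-defined no-op, which is what lets BenchmarkRunner hold an unconditional pointer. A usage sketch (illustrative; the include path is an assumption, as this is an internal header):

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>
    #include "src/perf_counters.h"  // internal header, path illustrative

    using benchmark::internal::PerfCountersMeasurement;

    void NoCountersIsANoOp() {
      PerfCountersMeasurement pcm({});  // nothing requested
      assert(pcm.num_counters() == 0);
      std::vector<std::pair<std::string, double>> measurements;
      assert(pcm.Start());             // true immediately, snapshots nothing
      assert(pcm.Stop(measurements));  // also true; `measurements` stays empty
    }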
diff --git a/test/perf_counters_gtest.cc b/test/perf_counters_gtest.cc
index e31758de..e73ebc58 100644
--- a/test/perf_counters_gtest.cc
+++ b/test/perf_counters_gtest.cc
@@ -1,3 +1,4 @@
+#include <random>
 #include <thread>
 
 #include "../src/perf_counters.h"
@@ -28,7 +29,7 @@ TEST(PerfCountersTest, OneCounter) {
     GTEST_SKIP() << "Performance counters not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1}).IsValid());
+  EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
 }
 
 TEST(PerfCountersTest, NegativeTest) {
@@ -37,38 +38,42 @@ TEST(PerfCountersTest, NegativeTest) {
     return;
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  EXPECT_FALSE(PerfCounters::Create({}).IsValid());
-  EXPECT_FALSE(PerfCounters::Create({""}).IsValid());
-  EXPECT_FALSE(PerfCounters::Create({"not a counter name"}).IsValid());
-  EXPECT_TRUE(PerfCounters::Create(
-                  {kGenericPerfEvent1, kGenericPerfEvent2, kGenericPerfEvent3})
-                  .IsValid());
+  // Sanity checks.
+  // Create() will always return a valid object, even if passed no
+  // arguments or wrong ones, as the new behavior is to warn and drop
+  // unsupported counters.
+  EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
+  EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
+  EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
   {
+    // Try sneaking in a bad egg to see if it is filtered out. The
+    // number of counters has to be two, not zero.
     auto counter =
         PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
-    EXPECT_TRUE(counter.IsValid());
     EXPECT_EQ(counter.num_counters(), 2);
     EXPECT_EQ(counter.names(), std::vector<std::string>(
                                    {kGenericPerfEvent2, kGenericPerfEvent1}));
   }
   {
+    // Try sneaking in an outrageous counter, like a fat-finger mistake.
     auto counter = PerfCounters::Create(
         {kGenericPerfEvent3, "not a counter name", kGenericPerfEvent1});
-    EXPECT_TRUE(counter.IsValid());
     EXPECT_EQ(counter.num_counters(), 2);
     EXPECT_EQ(counter.names(), std::vector<std::string>(
                                    {kGenericPerfEvent3, kGenericPerfEvent1}));
   }
   {
-    EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
-                                      kGenericPerfEvent3})
-                    .IsValid());
+    // Finally try a golden input - it should take all of them.
+    EXPECT_EQ(PerfCounters::Create(
+                  {kGenericPerfEvent1, kGenericPerfEvent2, kGenericPerfEvent3})
+                  .num_counters(),
+              3);
   }
   {
+    // Add a bad apple at the end of the chain to check the edges.
     auto counter = PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
                                          kGenericPerfEvent3,
                                          "MISPREDICTED_BRANCH_RETIRED"});
-    EXPECT_TRUE(counter.IsValid());
     EXPECT_EQ(counter.num_counters(), 3);
     EXPECT_EQ(counter.names(), std::vector<std::string>({kGenericPerfEvent1,
                                                          kGenericPerfEvent2,
@@ -82,7 +87,7 @@ TEST(PerfCountersTest, Read1Counter) {
   }
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters = PerfCounters::Create({kGenericPerfEvent1});
-  EXPECT_TRUE(counters.IsValid());
+  EXPECT_EQ(counters.num_counters(), 1);
   PerfCounterValues values1(1);
   EXPECT_TRUE(counters.Snapshot(&values1));
   EXPECT_GT(values1[0], 0);
@@ -99,7 +104,7 @@ TEST(PerfCountersTest, Read2Counters) {
   EXPECT_TRUE(PerfCounters::Initialize());
   auto counters =
       PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
-  EXPECT_TRUE(counters.IsValid());
+  EXPECT_EQ(counters.num_counters(), 2);
   PerfCounterValues values1(2);
   EXPECT_TRUE(counters.Snapshot(&values1));
   EXPECT_GT(values1[0], 0);
   EXPECT_GT(values1[1], 0);
@@ -111,62 +116,107 @@ TEST(PerfCountersTest, Read2Counters) {
 }
 
 TEST(PerfCountersTest, ReopenExistingCounters) {
-  // The test works (i.e. causes read to fail) for the assumptions
-  // about hardware capabilities (i.e. small number (3-4) hardware
-  // counters) at this date.
+  // This test works on recent as well as older Intel hardware.
+  // However, we cannot make assumptions beyond 3 HW counters.
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  std::vector<PerfCounters> counters;
-  counters.reserve(6);
-  for (int i = 0; i < 6; i++)
-    counters.push_back(PerfCounters::Create({kGenericPerfEvent1}));
+  std::vector<std::string> kMetrics({kGenericPerfEvent1});
+  std::vector<PerfCounters> counters(3);
+  for (auto& counter : counters) {
+    counter = PerfCounters::Create(kMetrics);
+  }
   PerfCounterValues values(1);
   EXPECT_TRUE(counters[0].Snapshot(&values));
-  EXPECT_FALSE(counters[4].Snapshot(&values));
-  EXPECT_FALSE(counters[5].Snapshot(&values));
+  EXPECT_TRUE(counters[1].Snapshot(&values));
+  EXPECT_TRUE(counters[2].Snapshot(&values));
 }
 
 TEST(PerfCountersTest, CreateExistingMeasurements) {
   // The test works (i.e. causes read to fail) for the assumptions
-  // about hardware capabilities (i.e. small number (3-4) hardware
+  // about hardware capabilities (i.e. small number (3) hardware
   // counters) at this date,
   // the same as previous test ReopenExistingCounters.
   if (!PerfCounters::kSupported) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  std::vector<PerfCountersMeasurement> perf_counter_measurements;
+
+  // We will try 10 counters here, but at this time we can only guarantee
+  // that 3 of them will work. Perhaps in the future we could use libpfm
+  // to query for the hardware limits on this particular platform.
+  const int kMaxCounters = 10;
+  const int kMinValidCounters = 3;
+
+  // Let's use a ubiquitous counter that is guaranteed to work
+  // on all platforms.
+  const std::vector<std::string> kMetrics{"cycles"};
+
+  // We cannot create a vector of actual objects because the copy
+  // constructor of PerfCounters is deleted - and so it is implicitly
+  // deleted in PerfCountersMeasurement too.
+  std::vector<std::unique_ptr<PerfCountersMeasurement>>
+      perf_counter_measurements;
+
+  perf_counter_measurements.reserve(kMaxCounters);
+  for (int j = 0; j < kMaxCounters; ++j) {
+    perf_counter_measurements.emplace_back(
+        new PerfCountersMeasurement(kMetrics));
+  }
+
   std::vector<std::pair<std::string, double>> measurements;
-  perf_counter_measurements.reserve(10);
-  for (int i = 0; i < 10; i++)
-    perf_counter_measurements.emplace_back(
-        std::vector<std::string>{kGenericPerfEvent1});
 
-  perf_counter_measurements[0].Start();
-  EXPECT_TRUE(perf_counter_measurements[0].Stop(measurements));
+  // Start all counters together to see if they hold.
+  int max_counters = kMaxCounters;
+  for (int i = 0; i < kMaxCounters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    EXPECT_EQ(counter.num_counters(), 1);
+    if (!counter.Start()) {
+      max_counters = i;
+      break;
+    }
+  }
 
-  measurements.clear();
-  perf_counter_measurements[8].Start();
-  EXPECT_FALSE(perf_counter_measurements[8].Stop(measurements));
+  ASSERT_GE(max_counters, kMinValidCounters);
 
-  measurements.clear();
-  perf_counter_measurements[9].Start();
-  EXPECT_FALSE(perf_counter_measurements[9].Stop(measurements));
+  // Stop all together.
+  for (int i = 0; i < max_counters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
+  }
+
+  // Start/stop individually.
+  for (int i = 0; i < max_counters; ++i) {
+    auto& counter(*perf_counter_measurements[i]);
+    measurements.clear();
+    counter.Start();
+    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
+  }
 }
 
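Aside on the two anti-optimization layers used below (the function here is illustrative, not part of the patch): BENCHMARK_DONT_OPTIMIZE disables optimization for a whole function on clang/gcc (and is a no-op elsewhere), while benchmark::DoNotOptimize() only forces a single value to be materialized; the rewritten do_work() uses both, plus RNG entropy, as a belt-and-braces defense:

    #include <benchmark/benchmark.h>
    #include <cstddef>

    BENCHMARK_DONT_OPTIMIZE size_t spin() {  // whole function left unoptimized
      size_t sum = 0;
      for (size_t i = 0; i < 1000; ++i) sum += i;
      benchmark::DoNotOptimize(sum);  // value-level escape hatch
      return sum;
    }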
-size_t do_work() {
-  size_t res = 0;
-  for (size_t i = 0; i < 100000000; ++i) res += i * i;
-  return res;
+// We try to do some meaningful work here, but the compiler
+// insists on optimizing away our loop, so we had to add a
+// no-optimize macro. In case it fails, we added some entropy
+// to this pool as well.
+BENCHMARK_DONT_OPTIMIZE size_t do_work() {
+  static std::mt19937 rd{std::random_device{}()};
+  static std::uniform_int_distribution<size_t> mrand(0, 10);
+  const size_t kNumLoops = 1000000;
+  size_t sum = 0;
+  for (size_t j = 0; j < kNumLoops; ++j) {
+    sum += mrand(rd);
+  }
+  benchmark::DoNotOptimize(sum);
+  return sum;
 }
 
-void measure(size_t threadcount, PerfCounterValues* values1,
-             PerfCounterValues* values2) {
-  BM_CHECK_NE(values1, nullptr);
-  BM_CHECK_NE(values2, nullptr);
+void measure(size_t threadcount, PerfCounterValues* before,
+             PerfCounterValues* after) {
+  BM_CHECK_NE(before, nullptr);
+  BM_CHECK_NE(after, nullptr);
   std::vector<std::thread> threads(threadcount);
   auto work = [&]() { BM_CHECK(do_work() > 1000); };
@@ -178,9 +228,9 @@ void measure(size_t threadcount, PerfCounterValues* values1,
   auto counters =
       PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent3});
   for (auto& t : threads) t = std::thread(work);
-  counters.Snapshot(values1);
+  counters.Snapshot(before);
   for (auto& t : threads) t.join();
-  counters.Snapshot(values2);
+  counters.Snapshot(after);
 }
 
 TEST(PerfCountersTest, MultiThreaded) {
@@ -188,21 +238,29 @@ TEST(PerfCountersTest, MultiThreaded) {
     GTEST_SKIP() << "Test skipped because libpfm is not supported.";
   }
   EXPECT_TRUE(PerfCounters::Initialize());
-  PerfCounterValues values1(2);
-  PerfCounterValues values2(2);
+  PerfCounterValues before(2);
+  PerfCounterValues after(2);
 
-  measure(2, &values1, &values2);
-  std::vector<double> D1{static_cast<double>(values2[0] - values1[0]),
-                         static_cast<double>(values2[1] - values1[1])};
+  // Notice that this test will work even if we taskset it to a single CPU;
+  // in that case the threads will simply run sequentially.
+  // Start two threads and measure the number of combined cycles and
+  // instructions.
+  measure(2, &before, &after);
+  std::vector<double> Elapsed2Threads{
+      static_cast<double>(after[0] - before[0]),
+      static_cast<double>(after[1] - before[1])};
 
-  measure(4, &values1, &values2);
-  std::vector<double> D2{static_cast<double>(values2[0] - values1[0]),
-                         static_cast<double>(values2[1] - values1[1])};
+  // Start four threads and measure the number of combined cycles and
+  // instructions.
+  measure(4, &before, &after);
+  std::vector<double> Elapsed4Threads{
+      static_cast<double>(after[0] - before[0]),
+      static_cast<double>(after[1] - before[1])};
 
   // Some extra work will happen on the main thread - like joining the threads
   // - so the ratio won't be quite 2.0, but very close.
-  EXPECT_GE(D2[0], 1.9 * D1[0]);
-  EXPECT_GE(D2[1], 1.9 * D1[1]);
+  EXPECT_GE(Elapsed4Threads[0], 1.9 * Elapsed2Threads[0]);
+  EXPECT_GE(Elapsed4Threads[1], 1.9 * Elapsed2Threads[1]);
 }
 
 TEST(PerfCountersTest, HardwareLimits) {