Refactoring of PerfCounters infrastructure (#1559)

* Refactoring of PerfCounters infrastructure

The main feature in this pull request is the removal of the static
sharing of PerfCounters: counters are now created in the top-level
`RunBenchmarks()` function, where all benchmark runners are created. A
single PerfCountersMeasurement object is created and then shared with
all the new BenchmarkRunner objects, one per existing benchmark.
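
Roughly, the new lifetime model is "one owner, many borrowers". Here is a
minimal self-contained sketch with hypothetical stand-in types (not the
library's actual classes; the real code is in src/benchmark.cc below):

    #include <string>
    #include <utility>
    #include <vector>

    // Stand-ins for PerfCountersMeasurement / BenchmarkRunner, only to
    // illustrate the lifetime: one shared object, created up front and
    // borrowed by every runner through a non-owning pointer.
    struct Measurement {
      explicit Measurement(std::vector<std::string> names)
          : names(std::move(names)) {}
      std::vector<std::string> names;
    };
    struct Runner {
      explicit Runner(Measurement* shared) : pcm(shared) {}
      Measurement* pcm;  // non-owning; the Measurement outlives all Runners
    };

    int main() {
      Measurement shared({"CYCLES", "INSTRUCTIONS"});  // created once
      std::vector<Runner> runners;
      for (int i = 0; i < 3; ++i) runners.emplace_back(&shared);
    }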

Other changes bundled into this PR:
- Added a BENCHMARK_DONT_OPTIMIZE macro in global scope
- Removed the `IsValid()` query, replacing it with a check on the number
  of counters remaining after validation (see the sketch after this list)
- Refactored all GTests to reflect the changes and new semantics
- Added extra comments throughout the new code to clarify intent
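
For illustration, a call site that previously asked `IsValid()` now counts
the counters that survived validation. A sketch against the new API
(`HaveUsableCounters` is a hypothetical helper, not part of this PR):

    #include <string>
    #include <vector>
    #include "../src/perf_counters.h"  // internal header, as the gtests use

    // "Valid" now means "at least one counter survived validation".
    bool HaveUsableCounters(const std::vector<std::string>& names) {
      // Old: PerfCounters::Create(names).IsValid()
      return benchmark::internal::PerfCounters::Create(names).num_counters() > 0;
    }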

It was extremely hard to separate these features into different PRs as
requested because they are so interdependent, so I'm just pushing them
all together and asking for forgiveness.

This PR replaces PRs 1555 and 1558, which have been closed.

* Fixed whitespace issue with clang-format

My clang-format insists on deleting a single whitespace character on line
601, while GitHub's clang-format check fails when it is added. I had to
disable format-on-save to check in this revert change.
I'm using clang 14.0.6.
Henrique Bucher, 2023-03-07 04:27:52 -06:00 (committed by GitHub)
commit fbc6efa9b5, parent 9885aefb96
7 changed files with 261 additions and 179 deletions

include/benchmark/benchmark.h

@@ -218,6 +218,18 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
#define BENCHMARK_UNUSED
#endif
// Used to annotate functions, methods and classes so they
// are not optimized by the compiler. Useful for tests
// where you expect loops to stay in place churning cycles
#if defined(__clang__)
#define BENCHMARK_DONT_OPTIMIZE __attribute__((optnone))
#elif defined(__GNUC__) || defined(__GNUG__)
#define BENCHMARK_DONT_OPTIMIZE __attribute__((optimize(0)))
#else
// MSVC & Intel do not have a no-optimize attribute, only line pragmas
#define BENCHMARK_DONT_OPTIMIZE
#endif
#if defined(__GNUC__) || defined(__clang__)
#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
#elif defined(_MSC_VER) && !defined(__clang__)
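
For context, a usage sketch of the new macro (SpinWork is a made-up name;
note the macro intentionally expands to nothing on MSVC/Intel, so tests
should not rely on it alone):

    #include <cstddef>
    #include "benchmark/benchmark.h"

    // Keep a cycle-churning test helper from being optimized away; pair
    // the attribute with DoNotOptimize as a belt-and-braces measure.
    BENCHMARK_DONT_OPTIMIZE size_t SpinWork() {
      size_t sum = 0;
      for (size_t i = 0; i < 1000000; ++i) sum += i;
      benchmark::DoNotOptimize(sum);
      return sum;
    }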

src/benchmark.cc

@@ -348,14 +348,26 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
size_t num_repetitions_total = 0;
// This perfcounters object needs to be created before the runners vector
// below so it outlasts their lifetime.
PerfCountersMeasurement perfcounters(
StrSplit(FLAGS_benchmark_perf_counters, ','));
// Vector of benchmark runners, one per benchmark
std::vector<internal::BenchmarkRunner> runners;
runners.reserve(benchmarks.size());
// Count the number of benchmarks with threads to warn the user in case
// performance counters are used.
int benchmarks_with_threads = 0;
// Loop through all benchmarks
for (const BenchmarkInstance& benchmark : benchmarks) {
BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
if (benchmark.complexity() != oNone)
reports_for_family = &per_family_reports[benchmark.family_index()];
runners.emplace_back(benchmark, reports_for_family);
benchmarks_with_threads += (benchmark.threads() > 0);
runners.emplace_back(benchmark, &perfcounters, reports_for_family);
int num_repeats_of_this_instance = runners.back().GetNumRepeats();
num_repetitions_total += num_repeats_of_this_instance;
if (reports_for_family)
@@ -363,6 +375,17 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
}
assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
// The use of performance counters with threads would be unintuitive for
// the average user so we need to warn them about this case
if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) {
GetErrorLogInstance()
<< "***WARNING*** There are " << benchmarks_with_threads
<< " benchmarks with threads and " << perfcounters.num_counters()
<< " performance counters were requested. Beware counters will "
"reflect the combined usage across all "
"threads.\n";
}
std::vector<size_t> repetition_indices;
repetition_indices.reserve(num_repetitions_total);
for (size_t runner_index = 0, num_runners = runners.size();

src/benchmark_runner.cc

@@ -221,6 +221,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value) {
BenchmarkRunner::BenchmarkRunner(
const benchmark::internal::BenchmarkInstance& b_,
PerfCountersMeasurement* pcm_,
BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
: b(b_),
reports_for_family(reports_for_family_),
@@ -239,10 +240,7 @@ BenchmarkRunner::BenchmarkRunner(
iters(has_explicit_iteration_count
? ComputeIters(b_, parsed_benchtime_flag)
: 1),
perf_counters_measurement(StrSplit(FLAGS_benchmark_perf_counters, ',')),
perf_counters_measurement_ptr(perf_counters_measurement.IsValid()
? &perf_counters_measurement
: nullptr) {
perf_counters_measurement_ptr(pcm_) {
run_results.display_report_aggregates_only =
(FLAGS_benchmark_report_aggregates_only ||
FLAGS_benchmark_display_aggregates_only);
@@ -255,7 +253,7 @@ BenchmarkRunner::BenchmarkRunner(
run_results.file_report_aggregates_only =
(b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly);
BM_CHECK(FLAGS_benchmark_perf_counters.empty() ||
perf_counters_measurement.IsValid())
(perf_counters_measurement_ptr->num_counters() == 0))
<< "Perf counters were requested but could not be set up.";
}
}

src/benchmark_runner.h

@@ -58,6 +58,7 @@ BenchTimeType ParseBenchMinTime(const std::string& value);
class BenchmarkRunner {
public:
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
benchmark::internal::PerfCountersMeasurement* pmc_,
BenchmarkReporter::PerFamilyRunReports* reports_for_family);
int GetNumRepeats() const { return repeats; }
@@ -103,8 +104,7 @@ class BenchmarkRunner {
// So only the first repetition has to find/calculate it,
// the other repetitions will just use that precomputed iteration count.
PerfCountersMeasurement perf_counters_measurement;
PerfCountersMeasurement* const perf_counters_measurement_ptr;
PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr;
struct IterationResults {
internal::ThreadManager::Result results;

src/perf_counters.cc

@@ -71,80 +71,78 @@ bool PerfCounters::IsCounterSupported(const std::string& name) {
return (ret == PFM_SUCCESS);
}
// Validates all counter names passed, returning only the valid ones
static std::vector<std::string> validateCounters(
const std::vector<std::string>& counter_names) {
// All valid names to be returned
std::vector<std::string> valid_names;
// Loop through all the given names
int invalid_counter = 0;
for (const std::string& name : counter_names) {
// Check trivial empty
if (name.empty()) {
GetErrorLogInstance() << "A counter name was the empty string\n";
invalid_counter++;
continue;
}
if (PerfCounters::IsCounterSupported(name)) {
// we are about to push into the valid names vector
// check if we did not reach the maximum
if (valid_names.size() == PerfCounterValues::kMaxCounters) {
GetErrorLogInstance()
<< counter_names.size()
<< " counters were requested. The maximum is "
<< PerfCounterValues::kMaxCounters << " and "
<< counter_names.size() - invalid_counter - valid_names.size()
<< " will be ignored\n";
// stop the loop and return what we have already
break;
}
valid_names.push_back(name);
} else {
GetErrorLogInstance() << "Performance counter " << name
<< " incorrect or not supported on this platform\n";
invalid_counter++;
}
}
// RVO should take care of this
return valid_names;
}
PerfCounters PerfCounters::Create(
const std::vector<std::string>& counter_names) {
std::vector<std::string> valid_names = validateCounters(counter_names);
if (valid_names.empty()) {
return NoCounters();
}
std::vector<int> counter_ids(valid_names.size());
// Valid counters will populate these arrays but we start empty
std::vector<std::string> valid_names;
std::vector<int> counter_ids;
std::vector<int> leader_ids;
const int mode = PFM_PLM3; // user mode only
// Reserve the maximum possible size
valid_names.reserve(counter_names.size());
counter_ids.reserve(counter_names.size());
const int kCounterMode = PFM_PLM3; // user mode only
// Group leads will be assigned on demand. The idea is that once we cannot
// create a counter descriptor, the reason is that this group has maxed out
// so we set the group_id again to -1 and retry - giving the algorithm a
// chance to create a new group leader to hold the next set of counters.
int group_id = -1;
for (size_t i = 0; i < valid_names.size(); ++i) {
// Loop through all performance counters
for (size_t i = 0; i < counter_names.size(); ++i) {
// we are about to push into the valid names vector;
// check that we have not reached the maximum
if (valid_names.size() == PerfCounterValues::kMaxCounters) {
// Log a message if we maxed out and stop adding
GetErrorLogInstance()
<< counter_names.size() << " counters were requested. The maximum is "
<< PerfCounterValues::kMaxCounters << " and " << valid_names.size()
<< " were already added. All remaining counters will be ignored\n";
// stop the loop and return what we have already
break;
}
// Check if this name is empty
const auto& name = counter_names[i];
if (name.empty()) {
GetErrorLogInstance()
<< "A performance counter name was the empty string\n";
continue;
}
// Here first means first in group, i.e. the group leader
const bool is_first = (group_id < 0);
// This struct will be populated by libpfm from the counter string
// and then fed into the syscall perf_event_open
struct perf_event_attr attr {};
attr.size = sizeof(attr);
const auto& name = valid_names[i];
// This is the input struct to libpfm.
pfm_perf_encode_arg_t arg{};
arg.attr = &attr;
const int pfm_get =
pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT, &arg);
const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode,
PFM_OS_PERF_EVENT, &arg);
if (pfm_get != PFM_SUCCESS) {
GetErrorLogInstance() << "Unknown counter name: " << name << "\n";
return NoCounters();
GetErrorLogInstance()
<< "Unknown performance counter name: " << name << "\n";
continue;
}
attr.disabled = is_first;
// We then proceed to populate the remaining fields in our attribute struct
// Note: the man page for perf_event_open suggests inherit = true and
// read_format = PERF_FORMAT_GROUP don't work together, but that's not the
// case.
attr.disabled = is_first;
attr.inherit = true;
attr.pinned = is_first;
attr.exclude_kernel = true;
attr.exclude_user = false;
attr.exclude_hv = true;
// Read all counters in one read.
// Read all counters in a group in one read.
attr.read_format = PERF_FORMAT_GROUP;
int id = -1;
@@ -159,36 +157,64 @@ PerfCounters PerfCounters::Create(
}
}
if (id < 0) {
// We reached a limit perhaps?
// If the file descriptor is negative we might have reached a limit
// in the current group. Set the group_id to -1 and retry
if (group_id >= 0) {
// Create a new group
group_id = -1;
} else {
// Give up, there is nothing else to try
// At this point we have already retried to set a new group id and
// failed. We then give up.
break;
}
}
}
// We failed to get a new file descriptor. We might have reached a hard
// hardware limit that cannot be resolved even with group multiplexing
if (id < 0) {
GetErrorLogInstance()
<< "Failed to get a file descriptor for " << name << "\n";
return NoCounters();
GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor "
"for performance counter "
<< name << ". Ignoring\n";
// We give up on this counter but try to keep going
// as the others would be fine
continue;
}
if (group_id < 0) {
// This is a leader, store and assign it
// This is a leader, store and assign it to the current file descriptor
leader_ids.push_back(id);
group_id = id;
}
counter_ids[i] = id;
// This is a valid counter, add it to our descriptor's list
counter_ids.push_back(id);
valid_names.push_back(name);
}
// Loop through all group leaders, activating them
// There is another option of starting ALL counters in a process, but
// that would be far too intrusive. If the user is using PMCs
// by themselves then this would have a side effect on them. It is
// friendlier to loop through all groups individually.
for (int lead : leader_ids) {
if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) {
GetErrorLogInstance() << "Failed to start counters\n";
// This should never happen but if it does, we give up on the
// entire batch as recovery would be a mess.
GetErrorLogInstance() << "***WARNING*** Failed to start counters. "
"Claring out all counters.\n";
// Close all peformance counters
for (int id : counter_ids) {
::close(id);
}
// Return an empty object so our internal state is still good and
// the process can continue normally without impact
return NoCounters();
}
}
return PerfCounters(valid_names, std::move(counter_ids),
return PerfCounters(std::move(valid_names), std::move(counter_ids),
std::move(leader_ids));
}
@@ -223,34 +249,10 @@ PerfCounters PerfCounters::Create(
void PerfCounters::CloseCounters() const {}
#endif // defined HAVE_LIBPFM
Mutex PerfCountersMeasurement::mutex_;
int PerfCountersMeasurement::ref_count_ = 0;
PerfCounters PerfCountersMeasurement::counters_ = PerfCounters::NoCounters();
// The validation in PerfCounters::Create may yield fewer counters than
// requested, so it is safe to initialize start_values_ and end_values_
// with the requested count as an upper bound
PerfCountersMeasurement::PerfCountersMeasurement(
const std::vector<std::string>& counter_names)
: start_values_(counter_names.size()), end_values_(counter_names.size()) {
MutexLock l(mutex_);
if (ref_count_ == 0) {
counters_ = PerfCounters::Create(counter_names);
}
// We chose to increment it even if `counters_` ends up invalid,
// so that we don't keep trying to create, and also since the dtor
// will decrement regardless of `counters_`'s validity
++ref_count_;
BM_CHECK(!counters_.IsValid() || counters_.names() == counter_names);
}
PerfCountersMeasurement::~PerfCountersMeasurement() {
MutexLock l(mutex_);
--ref_count_;
if (ref_count_ == 0) {
counters_ = PerfCounters::NoCounters();
}
counters_ = PerfCounters::Create(counter_names);
}
PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept {

src/perf_counters.h

@@ -90,10 +90,11 @@ class BENCHMARK_EXPORT PerfCounters final {
// True iff this platform supports performance counters.
static const bool kSupported;
bool IsValid() const { return !counter_names_.empty(); }
// Returns an empty object
static PerfCounters NoCounters() { return PerfCounters(); }
~PerfCounters() { CloseCounters(); }
PerfCounters() = default;
PerfCounters(PerfCounters&&) = default;
PerfCounters(const PerfCounters&) = delete;
PerfCounters& operator=(PerfCounters&&) noexcept;
@@ -110,8 +111,8 @@ class BENCHMARK_EXPORT PerfCounters final {
// Return a PerfCounters object ready to read the counters with the names
// specified. The values are user-mode only. The counter name format is
// implementation and OS specific.
// TODO: once we move to C++-17, this should be a std::optional, and then the
// IsValid() boolean can be dropped.
// In case of failure, this method will in the worst case return an
// empty object whose state will still be valid.
static PerfCounters Create(const std::vector<std::string>& counter_names);
// Take a snapshot of the current value of the counters into the provided
@@ -120,7 +121,6 @@ class BENCHMARK_EXPORT PerfCounters final {
BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const {
#ifndef BENCHMARK_OS_WINDOWS
assert(values != nullptr);
assert(IsValid());
return values->Read(leader_ids_) == counter_ids_.size();
#else
(void)values;
@@ -137,7 +137,6 @@ class BENCHMARK_EXPORT PerfCounters final {
: counter_ids_(std::move(counter_ids)),
leader_ids_(std::move(leader_ids)),
counter_names_(counter_names) {}
PerfCounters() = default;
void CloseCounters() const;
@@ -150,33 +149,25 @@ class BENCHMARK_EXPORT PerfCounters final {
class BENCHMARK_EXPORT PerfCountersMeasurement final {
public:
PerfCountersMeasurement(const std::vector<std::string>& counter_names);
~PerfCountersMeasurement();
// The only way to get to `counters_` is after ctor-ing a
// `PerfCountersMeasurement`, which means that `counters_`'s state is, here,
// decided (either invalid or valid) and won't change again even if a ctor is
// concurrently running with this. This is preferring efficiency to
// maintainability, because the address of the static can be known at compile
// time.
bool IsValid() const {
MutexLock l(mutex_);
return counters_.IsValid();
}
size_t num_counters() const { return counters_.num_counters(); }
BENCHMARK_ALWAYS_INLINE void Start() {
assert(IsValid());
MutexLock l(mutex_);
std::vector<std::string> names() const { return counters_.names(); }
BENCHMARK_ALWAYS_INLINE bool Start() {
if (num_counters() == 0) return true;
// Tell the compiler to not move instructions above/below where we take
// the snapshot.
ClobberMemory();
valid_read_ &= counters_.Snapshot(&start_values_);
ClobberMemory();
return valid_read_;
}
BENCHMARK_ALWAYS_INLINE bool Stop(
std::vector<std::pair<std::string, double>>& measurements) {
assert(IsValid());
MutexLock l(mutex_);
if (num_counters() == 0) return true;
// Tell the compiler to not move instructions above/below where we take
// the snapshot.
ClobberMemory();
@@ -193,9 +184,7 @@ class BENCHMARK_EXPORT PerfCountersMeasurement final {
}
private:
static Mutex mutex_;
GUARDED_BY(mutex_) static int ref_count_;
GUARDED_BY(mutex_) static PerfCounters counters_;
PerfCounters counters_;
bool valid_read_ = true;
PerfCounterValues start_values_;
PerfCounterValues end_values_;
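
As a quick usage sketch of the new Start()/Stop() contract (both return
false on a failed read instead of asserting on IsValid(); the counter
name below is platform specific and just an example):

    #include <string>
    #include <utility>
    #include <vector>
    #include "../src/perf_counters.h"  // internal header, as the gtests use

    int main() {
      benchmark::internal::PerfCounters::Initialize();  // as the tests do
      benchmark::internal::PerfCountersMeasurement pcm({"CYCLES"});
      if (pcm.num_counters() == 0) return 0;  // nothing survived validation
      std::vector<std::pair<std::string, double>> measurements;
      if (pcm.Start() && pcm.Stop(measurements)) {
        // measurements now holds {counter name, delta} pairs
      }
      return 0;
    }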

test/perf_counters_gtest.cc

@@ -1,3 +1,4 @@
#include <random>
#include <thread>
#include "../src/perf_counters.h"
@@ -28,7 +29,7 @@ TEST(PerfCountersTest, OneCounter) {
GTEST_SKIP() << "Performance counters not supported.\n";
}
EXPECT_TRUE(PerfCounters::Initialize());
EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1}).IsValid());
EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
}
TEST(PerfCountersTest, NegativeTest) {
@@ -37,38 +38,42 @@ TEST(PerfCountersTest, NegativeTest) {
return;
}
EXPECT_TRUE(PerfCounters::Initialize());
EXPECT_FALSE(PerfCounters::Create({}).IsValid());
EXPECT_FALSE(PerfCounters::Create({""}).IsValid());
EXPECT_FALSE(PerfCounters::Create({"not a counter name"}).IsValid());
EXPECT_TRUE(PerfCounters::Create(
{kGenericPerfEvent1, kGenericPerfEvent2, kGenericPerfEvent3})
.IsValid());
// Sanity checks
// Create() will always create a valid object, even if passed no or
// wrong arguments as the new behavior is to warn and drop unsupported
// counters
EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
{
// Try sneaking in a bad egg to see if it is filtered out. The
// number of counters has to be two, not zero
auto counter =
PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
EXPECT_TRUE(counter.IsValid());
EXPECT_EQ(counter.num_counters(), 2);
EXPECT_EQ(counter.names(), std::vector<std::string>(
{kGenericPerfEvent2, kGenericPerfEvent1}));
}
{
// Try sneaking in an outrageous counter, like a fat finger mistake
auto counter = PerfCounters::Create(
{kGenericPerfEvent3, "not a counter name", kGenericPerfEvent1});
EXPECT_TRUE(counter.IsValid());
EXPECT_EQ(counter.num_counters(), 2);
EXPECT_EQ(counter.names(), std::vector<std::string>(
{kGenericPerfEvent3, kGenericPerfEvent1}));
}
{
EXPECT_TRUE(PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
kGenericPerfEvent3})
.IsValid());
// Finally try a golden input - it should accept all of them
EXPECT_EQ(PerfCounters::Create(
{kGenericPerfEvent1, kGenericPerfEvent2, kGenericPerfEvent3})
.num_counters(),
3);
}
{
// Add a bad apple in the end of the chain to check the edges
auto counter = PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
kGenericPerfEvent3,
"MISPREDICTED_BRANCH_RETIRED"});
EXPECT_TRUE(counter.IsValid());
EXPECT_EQ(counter.num_counters(), 3);
EXPECT_EQ(counter.names(),
std::vector<std::string>({kGenericPerfEvent1, kGenericPerfEvent2,
@@ -82,7 +87,7 @@ TEST(PerfCountersTest, Read1Counter) {
}
EXPECT_TRUE(PerfCounters::Initialize());
auto counters = PerfCounters::Create({kGenericPerfEvent1});
EXPECT_TRUE(counters.IsValid());
EXPECT_EQ(counters.num_counters(), 1);
PerfCounterValues values1(1);
EXPECT_TRUE(counters.Snapshot(&values1));
EXPECT_GT(values1[0], 0);
@@ -99,7 +104,7 @@ TEST(PerfCountersTest, Read2Counters) {
EXPECT_TRUE(PerfCounters::Initialize());
auto counters =
PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
EXPECT_TRUE(counters.IsValid());
EXPECT_EQ(counters.num_counters(), 2);
PerfCounterValues values1(2);
EXPECT_TRUE(counters.Snapshot(&values1));
EXPECT_GT(values1[0], 0);
@@ -111,62 +116,107 @@ TEST(PerfCountersTest, Read2Counters) {
}
TEST(PerfCountersTest, ReopenExistingCounters) {
// The test works (i.e. causes read to fail) for the assumptions
// about hardware capabilities (i.e. small number (3-4) hardware
// counters) at this date.
// This test works on recent and old Intel hardware.
// However, we cannot assume more than 3 HW counters
if (!PerfCounters::kSupported) {
GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
}
EXPECT_TRUE(PerfCounters::Initialize());
std::vector<PerfCounters> counters;
counters.reserve(6);
for (int i = 0; i < 6; i++)
counters.push_back(PerfCounters::Create({kGenericPerfEvent1}));
std::vector<std::string> kMetrics({kGenericPerfEvent1});
std::vector<PerfCounters> counters(3);
for (auto& counter : counters) {
counter = PerfCounters::Create(kMetrics);
}
PerfCounterValues values(1);
EXPECT_TRUE(counters[0].Snapshot(&values));
EXPECT_FALSE(counters[4].Snapshot(&values));
EXPECT_FALSE(counters[5].Snapshot(&values));
EXPECT_TRUE(counters[1].Snapshot(&values));
EXPECT_TRUE(counters[2].Snapshot(&values));
}
TEST(PerfCountersTest, CreateExistingMeasurements) {
// The test works (i.e. causes read to fail) for the assumptions
// about hardware capabilities (i.e. small number (3-4) hardware
// about hardware capabilities (i.e. small number (3) hardware
// counters) at this date,
// the same as previous test ReopenExistingCounters.
if (!PerfCounters::kSupported) {
GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
}
EXPECT_TRUE(PerfCounters::Initialize());
std::vector<PerfCountersMeasurement> perf_counter_measurements;
// This means we will try 10 counters, but at this time we can only
// guarantee that 3 will work. Perhaps in the future
// we could use libpfm to query for the hardware limits on this
// particular platform.
const int kMaxCounters = 10;
const int kMinValidCounters = 3;
// Let's use a ubiquitous counter that is guaranteed to work
// on all platforms
const std::vector<std::string> kMetrics{"cycles"};
// Cannot create a vector of actual objects because the
// copy constructor of PerfCounters is deleted - and so it is
// implicitly deleted on PerfCountersMeasurement too
std::vector<std::unique_ptr<PerfCountersMeasurement>>
perf_counter_measurements;
perf_counter_measurements.reserve(kMaxCounters);
for (int j = 0; j < kMaxCounters; ++j) {
perf_counter_measurements.emplace_back(
new PerfCountersMeasurement(kMetrics));
}
std::vector<std::pair<std::string, double>> measurements;
perf_counter_measurements.reserve(10);
for (int i = 0; i < 10; i++)
perf_counter_measurements.emplace_back(
std::vector<std::string>{kGenericPerfEvent1});
// Start all counters together to see if they hold
int max_counters = kMaxCounters;
for (int i = 0; i < kMaxCounters; ++i) {
auto& counter(*perf_counter_measurements[i]);
EXPECT_EQ(counter.num_counters(), 1);
if (!counter.Start()) {
max_counters = i;
break;
}
}
perf_counter_measurements[0].Start();
EXPECT_TRUE(perf_counter_measurements[0].Stop(measurements));
ASSERT_GE(max_counters, kMinValidCounters);
measurements.clear();
perf_counter_measurements[8].Start();
EXPECT_FALSE(perf_counter_measurements[8].Stop(measurements));
// Stop all together
for (int i = 0; i < max_counters; ++i) {
auto& counter(*perf_counter_measurements[i]);
EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
}
measurements.clear();
perf_counter_measurements[9].Start();
EXPECT_FALSE(perf_counter_measurements[9].Stop(measurements));
// Start/stop individually
for (int i = 0; i < max_counters; ++i) {
auto& counter(*perf_counter_measurements[i]);
measurements.clear();
counter.Start();
EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
}
}
size_t do_work() {
size_t res = 0;
for (size_t i = 0; i < 100000000; ++i) res += i * i;
return res;
// We try to do some meaningful work here, but the compiler
// insists on optimizing away our loop, so we had to add a
// no-optimize macro. In case it fails, we added some entropy
// to this pool as well.
BENCHMARK_DONT_OPTIMIZE size_t do_work() {
static std::mt19937 rd{std::random_device{}()};
static std::uniform_int_distribution<size_t> mrand(0, 10);
const size_t kNumLoops = 1000000;
size_t sum = 0;
for (size_t j = 0; j < kNumLoops; ++j) {
sum += mrand(rd);
}
benchmark::DoNotOptimize(sum);
return sum;
}
void measure(size_t threadcount, PerfCounterValues* values1,
PerfCounterValues* values2) {
BM_CHECK_NE(values1, nullptr);
BM_CHECK_NE(values2, nullptr);
void measure(size_t threadcount, PerfCounterValues* before,
PerfCounterValues* after) {
BM_CHECK_NE(before, nullptr);
BM_CHECK_NE(after, nullptr);
std::vector<std::thread> threads(threadcount);
auto work = [&]() { BM_CHECK(do_work() > 1000); };
@@ -178,9 +228,9 @@ void measure(size_t threadcount, PerfCounterValues* values1,
auto counters =
PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent3});
for (auto& t : threads) t = std::thread(work);
counters.Snapshot(values1);
counters.Snapshot(before);
for (auto& t : threads) t.join();
counters.Snapshot(values2);
counters.Snapshot(after);
}
TEST(PerfCountersTest, MultiThreaded) {
@@ -188,21 +238,29 @@ TEST(PerfCountersTest, MultiThreaded) {
GTEST_SKIP() << "Test skipped because libpfm is not supported.";
}
EXPECT_TRUE(PerfCounters::Initialize());
PerfCounterValues values1(2);
PerfCounterValues values2(2);
PerfCounterValues before(2);
PerfCounterValues after(2);
measure(2, &values1, &values2);
std::vector<double> D1{static_cast<double>(values2[0] - values1[0]),
static_cast<double>(values2[1] - values1[1])};
// Notice that this test will work even if we taskset it to a single CPU.
// In that case the threads will run sequentially.
// Start two threads and measure the number of combined cycles and
// instructions
measure(2, &before, &after);
std::vector<double> Elapsed2Threads{
static_cast<double>(after[0] - before[0]),
static_cast<double>(after[1] - before[1])};
measure(4, &values1, &values2);
std::vector<double> D2{static_cast<double>(values2[0] - values1[0]),
static_cast<double>(values2[1] - values1[1])};
// Start four threads and measure the number of combined cycles and
// instructions
measure(4, &before, &after);
std::vector<double> Elapsed4Threads{
static_cast<double>(after[0] - before[0]),
static_cast<double>(after[1] - before[1])};
// Some extra work will happen on the main thread - like joining the threads
// - so the ratio won't be quite 2.0, but very close.
EXPECT_GE(D2[0], 1.9 * D1[0]);
EXPECT_GE(D2[1], 1.9 * D1[1]);
EXPECT_GE(Elapsed4Threads[0], 1.9 * Elapsed2Threads[0]);
EXPECT_GE(Elapsed4Threads[1], 1.9 * Elapsed2Threads[1]);
}
TEST(PerfCountersTest, HardwareLimits) {