Add process_time for better OpenMP and user-managed thread timing

* Google Benchmark now works with OpenMP and other user-managed threading.
This commit is contained in:
Bryan Lunt 2019-04-09 07:01:33 -05:00 committed by Dominic Hamon
parent 30bd6ea7f8
commit 7a1c370283
8 changed files with 290 additions and 11 deletions

View File

@ -428,6 +428,49 @@ BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
Without `UseRealTime`, CPU time is used by default.
## CPU timers
By default, the CPU timer only measures the time spent by the main thread.
If the benchmark itself uses threads internally, this measurement may not
be what you are looking for. Instead, there is a way to measure the total
CPU usage of the process, by all the threads.
```c++
void callee(int i);
static void MyMain(int size) {
#pragma omp parallel for
for(int i = 0; i < size; i++)
callee(i);
}
static void BM_OpenMP(benchmark::State& state) {
for (auto _ : state)
MyMain(state.range(0);
}
// Measure the time spent by the main thread, use it to decide for how long to
// run the benchmark loop. Depending on the internal implementation detail may
// measure to anywhere from near-zero (the overhead spent before/after work
// handoff to worker thread[s]) to the whole single-thread time.
BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
// Measure the user-visible time, the wall clock (literally, the time that
// has passed on the clock on the wall), use it to decide for how long to
// run the benchmark loop. This will always be meaningful, an will match the
// time spent by the main thread in single-threaded case, in general decreasing
// with the number of internal threads doing the work.
BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
// Measure the total CPU consumption, use it to decide for how long to
// run the benchmark loop. This will always measure to no less than the
// time spent by the main thread in single-threaded case.
BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime();
// A mixture of the last two. Measure the total CPU consumption, but use the
// wall clock to decide for how long to run the benchmark loop.
BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
```
## Controlling timers
Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
is measured. But sometimes, it is nessesary to do some work inside of

View File

@ -874,11 +874,18 @@ class Benchmark {
// Same as ReportAggregatesOnly(), but applies to display reporter only.
Benchmark* DisplayAggregatesOnly(bool value = true);
// If a particular benchmark is I/O bound, runs multiple threads internally or
// if for some reason CPU timings are not representative, call this method. If
// called, the elapsed time will be used to control how many iterations are
// run, and in the printing of items/second or MB/seconds values. If not
// called, the cpu time used by the benchmark will be used.
// By default, the CPU time is measured only for the main thread, which may
// be unrepresentative if the benchmark uses threads internally. If called,
// the total CPU time spent by all the threads will be measured instead.
// By default, the only the main thread CPU time will be measured.
Benchmark* MeasureProcessCPUTime();
// If a particular benchmark should use the Wall clock instead of the CPU time
// (be it either the CPU time of the main thread only (default), or the
// total CPU usage of the benchmark), call this method. If called, the elapsed
// (wall) time will be used to control how many iterations are run, and in the
// printing of items/second or MB/seconds values.
// If not called, the CPU time used by the benchmark will be used.
Benchmark* UseRealTime();
// If a benchmark must measure time manually (e.g. if GPU execution time is
@ -952,6 +959,7 @@ class Benchmark {
double min_time_;
size_t iterations_;
int repetitions_;
bool measure_process_cpu_time_;
bool use_real_time_;
bool use_manual_time_;
BigO complexity_;

View File

@ -22,6 +22,7 @@ struct BenchmarkInstance {
std::vector<int64_t> arg;
TimeUnit time_unit;
int range_multiplier;
bool measure_process_cpu_time;
bool use_real_time;
bool use_manual_time;
BigO complexity;

View File

@ -165,6 +165,7 @@ bool BenchmarkFamilies::FindBenchmarks(
instance.min_time = family->min_time_;
instance.iterations = family->iterations_;
instance.repetitions = family->repetitions_;
instance.measure_process_cpu_time = family->measure_process_cpu_time_;
instance.use_real_time = family->use_real_time_;
instance.use_manual_time = family->use_manual_time_;
instance.complexity = family->complexity_;
@ -202,10 +203,20 @@ bool BenchmarkFamilies::FindBenchmarks(
instance.name.repetitions =
StrFormat("repeats:%d", family->repetitions_);
if (family->measure_process_cpu_time_) {
instance.name.time_type = "process_time";
}
if (family->use_manual_time_) {
instance.name.time_type = "manual_time";
if (!instance.name.time_type.empty()) {
instance.name.time_type += '/';
}
instance.name.time_type += "manual_time";
} else if (family->use_real_time_) {
instance.name.time_type = "real_time";
if (!instance.name.time_type.empty()) {
instance.name.time_type += '/';
}
instance.name.time_type += "real_time";
}
// Add the number of threads used to the name
@ -252,6 +263,7 @@ Benchmark::Benchmark(const char* name)
min_time_(0),
iterations_(0),
repetitions_(0),
measure_process_cpu_time_(false),
use_real_time_(false),
use_manual_time_(false),
complexity_(oNone),
@ -398,6 +410,12 @@ Benchmark* Benchmark::DisplayAggregatesOnly(bool value) {
return this;
}
Benchmark* Benchmark::MeasureProcessCPUTime() {
// Can be used together with UseRealTime() / UseManualTime().
measure_process_cpu_time_ = true;
return this;
}
Benchmark* Benchmark::UseRealTime() {
CHECK(!use_manual_time_)
<< "Cannot set UseRealTime and UseManualTime simultaneously.";

View File

@ -111,7 +111,10 @@ BenchmarkReporter::Run CreateRunReport(
// Adds the stats collected for the thread into *total.
void RunInThread(const BenchmarkInstance* b, size_t iters, int thread_id,
ThreadManager* manager) {
internal::ThreadTimer timer;
internal::ThreadTimer timer(
b->measure_process_cpu_time
? internal::ThreadTimer::CreateProcessCpuTime()
: internal::ThreadTimer::Create());
State st = b->Run(iters, thread_id, &timer, manager);
CHECK(st.iterations() >= st.max_iterations)
<< "Benchmark returned before State::KeepRunning() returned false!";
@ -226,6 +229,8 @@ class BenchmarkRunner {
// Adjust real/manual time stats since they were reported per thread.
i.results.real_time_used /= b.threads;
i.results.manual_time_used /= b.threads;
// If we were measuring whole-process CPU usage, adjust the CPU time too.
if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads;
VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
<< i.results.real_time_used << "\n";

View File

@ -8,14 +8,22 @@ namespace benchmark {
namespace internal {
class ThreadTimer {
explicit ThreadTimer(bool measure_process_cpu_time_)
: measure_process_cpu_time(measure_process_cpu_time_) {}
public:
ThreadTimer() = default;
static ThreadTimer Create() {
return ThreadTimer(/*measure_process_cpu_time_=*/false);
}
static ThreadTimer CreateProcessCpuTime() {
return ThreadTimer(/*measure_process_cpu_time_=*/true);
}
// Called by each thread
void StartTimer() {
running_ = true;
start_real_time_ = ChronoClockNow();
start_cpu_time_ = ThreadCPUUsage();
start_cpu_time_ = ReadCpuTimerOfChoice();
}
// Called by each thread
@ -25,7 +33,8 @@ class ThreadTimer {
real_time_used_ += ChronoClockNow() - start_real_time_;
// Floating point error can result in the subtraction producing a negative
// time. Guard against that.
cpu_time_used_ += std::max<double>(ThreadCPUUsage() - start_cpu_time_, 0);
cpu_time_used_ +=
std::max<double>(ReadCpuTimerOfChoice() - start_cpu_time_, 0);
}
// Called by each thread
@ -52,6 +61,14 @@ class ThreadTimer {
}
private:
double ReadCpuTimerOfChoice() const {
if (measure_process_cpu_time) return ProcessCPUUsage();
return ThreadCPUUsage();
}
// should the thread, or the process, time be measured?
const bool measure_process_cpu_time;
bool running_ = false; // Is the timer running
double start_real_time_ = 0; // If running_
double start_cpu_time_ = 0; // If running_

View File

@ -125,6 +125,9 @@ add_test(templated_fixture_test templated_fixture_test --benchmark_min_time=0.01
compile_output_test(user_counters_test)
add_test(user_counters_test user_counters_test --benchmark_min_time=0.01)
compile_output_test(internal_threading_test)
add_test(internal_threading_test internal_threading_test --benchmark_min_time=0.01)
compile_output_test(report_aggregates_only_test)
add_test(report_aggregates_only_test report_aggregates_only_test --benchmark_min_time=0.01)

View File

@ -0,0 +1,184 @@
#undef NDEBUG
#include <chrono>
#include <thread>
#include "../src/timers.h"
#include "benchmark/benchmark.h"
#include "output_test.h"
static const std::chrono::duration<double, std::milli> time_frame(50);
static const double time_frame_in_sec(
std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1, 1>>>(
time_frame)
.count());
void MyBusySpinwait() {
const auto start = benchmark::ChronoClockNow();
while (true) {
const auto now = benchmark::ChronoClockNow();
const auto elapsed = now - start;
if (std::chrono::duration<double, std::chrono::seconds::period>(elapsed) >=
time_frame)
return;
}
}
// ========================================================================= //
// --------------------------- TEST CASES BEGIN ---------------------------- //
// ========================================================================= //
// ========================================================================= //
// BM_MainThread
void BM_MainThread(benchmark::State& state) {
for (auto _ : state) {
MyBusySpinwait();
state.SetIterationTime(time_frame_in_sec);
}
state.counters["invtime"] =
benchmark::Counter{1, benchmark::Counter::kIsRate};
}
BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1);
BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseRealTime();
BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseManualTime();
BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime();
BENCHMARK(BM_MainThread)
->Iterations(1)
->Threads(1)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_MainThread)
->Iterations(1)
->Threads(1)
->MeasureProcessCPUTime()
->UseManualTime();
BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2);
BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseRealTime();
BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseManualTime();
BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime();
BENCHMARK(BM_MainThread)
->Iterations(1)
->Threads(2)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_MainThread)
->Iterations(1)
->Threads(2)
->MeasureProcessCPUTime()
->UseManualTime();
// ========================================================================= //
// BM_WorkerThread
void BM_WorkerThread(benchmark::State& state) {
for (auto _ : state) {
std::thread Worker(&MyBusySpinwait);
Worker.join();
state.SetIterationTime(time_frame_in_sec);
}
state.counters["invtime"] =
benchmark::Counter{1, benchmark::Counter::kIsRate};
}
BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1);
BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseRealTime();
BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseManualTime();
BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime();
BENCHMARK(BM_WorkerThread)
->Iterations(1)
->Threads(1)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_WorkerThread)
->Iterations(1)
->Threads(1)
->MeasureProcessCPUTime()
->UseManualTime();
BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2);
BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseRealTime();
BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseManualTime();
BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime();
BENCHMARK(BM_WorkerThread)
->Iterations(1)
->Threads(2)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_WorkerThread)
->Iterations(1)
->Threads(2)
->MeasureProcessCPUTime()
->UseManualTime();
// ========================================================================= //
// BM_MainThreadAndWorkerThread
void BM_MainThreadAndWorkerThread(benchmark::State& state) {
for (auto _ : state) {
std::thread Worker(&MyBusySpinwait);
MyBusySpinwait();
Worker.join();
state.SetIterationTime(time_frame_in_sec);
}
state.counters["invtime"] =
benchmark::Counter{1, benchmark::Counter::kIsRate};
}
BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(1);
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(1)
->UseRealTime();
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(1)
->UseManualTime();
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(1)
->MeasureProcessCPUTime();
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(1)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(1)
->MeasureProcessCPUTime()
->UseManualTime();
BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(2);
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(2)
->UseRealTime();
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(2)
->UseManualTime();
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(2)
->MeasureProcessCPUTime();
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(2)
->MeasureProcessCPUTime()
->UseRealTime();
BENCHMARK(BM_MainThreadAndWorkerThread)
->Iterations(1)
->Threads(2)
->MeasureProcessCPUTime()
->UseManualTime();
// ========================================================================= //
// ---------------------------- TEST CASES END ----------------------------- //
// ========================================================================= //
int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }