diff --git a/CMakeLists.txt b/CMakeLists.txt index 2526faf4..4296b235 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,10 @@ add_cxx_compiler_flag(-pedantic-errors) add_cxx_compiler_flag(-fno-strict-aliasing RELEASE) add_cxx_compiler_flag(-Wthread-safety) +if (HAVE_WTHREAD_SAFETY) + add_definitions(-DHAVE_WTHREAD_SAFETY) + cxx_feature_check(THREAD_SAFETY_ATTRIBUTES) +endif() # C++ feature checks cxx_feature_check(STD_REGEX) diff --git a/cmake/thread_safety_attributes.cpp b/cmake/thread_safety_attributes.cpp new file mode 100644 index 00000000..46161bab --- /dev/null +++ b/cmake/thread_safety_attributes.cpp @@ -0,0 +1,4 @@ +#define HAVE_THREAD_SAFETY_ATTRIBUTES +#include "../src/mutex.h" + +int main() {} diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 5da915ea..50f27f0d 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -135,7 +135,8 @@ BENCHMARK(BM_MultiThreaded)->Threads(4); #ifndef BENCHMARK_BENCHMARK_H_ #define BENCHMARK_BENCHMARK_H_ -#include +#include +#include #include #include @@ -153,10 +154,7 @@ void Initialize(int* argc, const char** argv); // Otherwise, run all benchmarks specified by the --benchmark_filter flag, // and exit after running the benchmarks. -void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr); - -// ------------------------------------------------------ -// Routines that can be called from within a benchmark +void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = NULL); // If this routine is called, peak memory allocation past this point in the // benchmark is reported at the end of the benchmark report line. (It is @@ -165,14 +163,6 @@ void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr); // TODO(dominic) // void MemoryUsage(); -// If a particular benchmark is I/O bound, or if for some reason CPU -// timings are not representative, call this method from within the -// benchmark routine. 
If called, the elapsed time will be used to -// control how many iterations are run, and in the printing of -// items/second or MB/seconds values. If not called, the cpu time -// used by the benchmark will be used. -void UseRealTime(); - namespace internal { class Benchmark; class BenchmarkFamilies; @@ -181,13 +171,63 @@ class BenchmarkFamilies; // State is passed to a running Benchmark and contains state for the // benchmark to use. class State { - public: - // Returns true iff the benchmark should continue through another iteration. - bool KeepRunning(); +public: + State(size_t max_iters, bool has_x, int x, bool has_y, int y, int thread_i); + // Returns true iff the benchmark should continue through another iteration. + // NOTE: A benchmark may not return from the test until KeepRunning() has + // returned false. + bool KeepRunning() { + if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) { + ResumeTiming(); + started_ = true; + } + bool const res = total_iterations_++ < max_iterations; + if (BENCHMARK_BUILTIN_EXPECT(!res, false)) { + assert(started_); + PauseTiming(); + // Total iterations now is one greater than max iterations. Fix this. + total_iterations_ = max_iterations; + } + return res; + } + + // REQUIRES: timer is running + // Stop the benchmark timer. If not called, the timer will be + // automatically stopped after KeepRunning() returns false for the first time. + // + // For threaded benchmarks the PauseTiming() function acts + // like a barrier. I.e., the ith call by a particular thread to this + // function will block until all threads have made their ith call. + // The timer will stop when the last thread has called this function. + // + // NOTE: PauseTiming()/ResumeTiming() are relatively + // heavyweight, and so their use should generally be avoided + // within each benchmark iteration, if possible. void PauseTiming(); + + // REQUIRES: timer is not running + // Start the benchmark timer. 
The timer is NOT running on entrance to the + // benchmark function. It begins running after the first call to KeepRunning() + // + // For threaded benchmarks the ResumeTiming() function acts + // like a barrier. I.e., the ith call by a particular thread to this + // function will block until all threads have made their ith call. + // The timer will start when the last thread has called this function. + // + // NOTE: PauseTiming()/ResumeTiming() are relatively + // heavyweight, and so their use should generally be avoided + // within each benchmark iteration, if possible. void ResumeTiming(); + // If a particular benchmark is I/O bound, or if for some reason CPU + // timings are not representative, call this method from within the + // benchmark routine. If called, the elapsed time will be used to + // control how many iterations are run, and in the printing of + // items/second or MB/seconds values. If not called, the cpu time + // used by the benchmark will be used. + void UseRealTime(); + // Set the number of bytes processed by the current benchmark // execution. This routine is typically called once at the end of a // throughput oriented benchmark. If this routine is called with a @@ -195,7 +235,15 @@ class State { // per iteration. // // REQUIRES: a benchmark has exited its KeepRunning loop. - void SetBytesProcessed(int64_t bytes); + BENCHMARK_ALWAYS_INLINE + void SetBytesProcessed(size_t bytes) { + bytes_processed_ = bytes; + } + + BENCHMARK_ALWAYS_INLINE + size_t bytes_processed() const { + return bytes_processed_; + } // If this routine is called with items > 0, then an items/s // label is printed on the benchmark report line for the currently @@ -203,94 +251,76 @@ class State { // benchmark where a processing items/second output is desired. // // REQUIRES: a benchmark has exited its KeepRunning loop. 
- void SetItemsProcessed(int64_t items); + BENCHMARK_ALWAYS_INLINE + void SetItemsProcessed(size_t items) { + items_processed_ = items; + } + + BENCHMARK_ALWAYS_INLINE + size_t items_processed() const { + return items_processed_; + } // If this routine is called, the specified label is printed at the // end of the benchmark report line for the currently executing // benchmark. Example: - // static void BM_Compress(benchmark::State& state) { + // static void BM_Compress(benchmark::State& state) { // ... // double compress = input_size / output_size; - // state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression)); + // state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression)); // } // Produces output that looks like: // BM_Compress 50 50 14115038 compress:27.3% // // REQUIRES: a benchmark has exited its KeepRunning loop. - void SetLabel(const std::string& label); + void SetLabel(const char* label); + + // Allow the use of std::string without actually including <string>. + // This function does not participate in overload resolution unless StringType + // has the nested typename `basic_string`. This typename should be provided + // as an injected class name in the case of std::string. + template + void SetLabel(StringType const & str, + typename StringType::basic_string* = 0) { + this->SetLabel(str.c_str()); + } // Range arguments for this run. CHECKs if the argument has been set. - int range_x() const; - int range_y() const; + BENCHMARK_ALWAYS_INLINE + int range_x() const { + assert(has_range_x_); + ((void)has_range_x_); // Prevent unused warning. + return range_x_; + } - int64_t iterations() const { return total_iterations_; } + BENCHMARK_ALWAYS_INLINE + int range_y() const { + assert(has_range_y_); + ((void)has_range_y_); // Prevent unused warning. 
+ return range_y_; + } + BENCHMARK_ALWAYS_INLINE + size_t iterations() const { return total_iterations_; } + +private: + bool started_; + size_t total_iterations_; + + bool has_range_x_; + int range_x_; + + bool has_range_y_; + int range_y_; + + size_t bytes_processed_; + size_t items_processed_; + +public: const int thread_index; + const size_t max_iterations; - private: - class FastClock; - struct SharedState; - struct ThreadStats; - - State(FastClock* clock, SharedState* s, int t); - bool StartRunning(); - bool FinishInterval(); - bool MaybeStop(); - void NewInterval(); - bool AllStarting(); - - static void* RunWrapper(void* arg); - void Run(); - void RunAsThread(); - void Wait(); - - enum EState { - STATE_INITIAL, // KeepRunning hasn't been called - STATE_STARTING, // KeepRunning called, waiting for other threads - STATE_RUNNING, // Running and being timed - STATE_STOPPING, // Not being timed but waiting for other threads - STATE_STOPPED // Stopped - }; - - EState state_; - - FastClock* clock_; - - // State shared by all BenchmarkRun objects that belong to the same - // BenchmarkInstance - SharedState* shared_; - - std::thread thread_; - - // Custom label set by the user. - std::string label_; - - // Each State object goes through a sequence of measurement intervals. By - // default each interval is approx. 100ms in length. The following stats are - // kept for each interval. - int64_t iterations_; - double start_cpu_; - double start_time_; - int64_t stop_time_micros_; - - double start_pause_cpu_; - double pause_cpu_time_; - double start_pause_real_; - double pause_real_time_; - - // Total number of iterations for all finished runs. - int64_t total_iterations_; - - // Approximate time in microseconds for one interval of execution. - // Dynamically adjusted as needed. - int64_t interval_micros_; - - // True if the current interval is the continuation of a previous one. 
- bool is_continuation_; - - std::unique_ptr stats_; - - friend class internal::Benchmark; +private: BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State); }; @@ -304,7 +334,6 @@ class BenchmarkReporter { struct Context { int num_cpus; double mhz_per_cpu; - // std::string cpu_info; bool cpu_scaling_enabled; // The number of chars in the longest benchmark name. @@ -312,19 +341,17 @@ class BenchmarkReporter { }; struct Run { - Run() - : thread_index(-1), - iterations(1), - real_accumulated_time(0), - cpu_accumulated_time(0), - bytes_per_second(0), - items_per_second(0), - max_heapbytes_used(0) {} + Run() : + iterations(1), + real_accumulated_time(0), + cpu_accumulated_time(0), + bytes_per_second(0), + items_per_second(0), + max_heapbytes_used(0) {} std::string benchmark_name; - std::string report_label; - int thread_index; - int64_t iterations; + std::string report_label; // Empty if not set by benchmark. + size_t iterations; double real_accumulated_time; double cpu_accumulated_time; @@ -350,22 +377,12 @@ class BenchmarkReporter { // benchmark, thus have the same name. virtual void ReportRuns(const std::vector& report) const = 0; - virtual ~BenchmarkReporter() {} + virtual ~BenchmarkReporter(); }; namespace internal { -typedef std::function BenchmarkFunction; - -// Run all benchmarks whose name is a partial match for the regular -// expression in "spec". The results of benchmark runs are fed to "reporter". -void RunMatchingBenchmarks(const std::string& spec, - const BenchmarkReporter* reporter); - -// Extract the list of benchmark names that match the specified regular -// expression. -void FindMatchingBenchmarkNames(const std::string& re, - std::vector* benchmark_names); +typedef void(Function)(State&); // ------------------------------------------------------ // Benchmark registration object. The BENCHMARK() macro expands @@ -375,8 +392,7 @@ void FindMatchingBenchmarkNames(const std::string& re, // chained into one expression. 
class Benchmark { public: - // The Benchmark takes ownership of the Callback pointed to by f. - Benchmark(const char* name, BenchmarkFunction f); + Benchmark(const char* name, Function* f); ~Benchmark(); @@ -444,40 +460,25 @@ class Benchmark { // Used inside the benchmark implementation struct Instance; - // Measure the overhead of an empty benchmark to subtract later. - static void MeasureOverhead(); - private: - friend class BenchmarkFamilies; - - std::vector CreateBenchmarkInstances(size_t rangeXindex, - size_t rangeYindex); - std::string name_; - BenchmarkFunction function_; - size_t registration_index_; - std::vector rangeX_; - std::vector rangeY_; + Function* function_; + std::size_t registration_index_; + int arg_count_; + std::vector< std::pair > args_; // Args for all benchmark runs std::vector thread_counts_; - std::mutex mutex_; // Special value placed in thread_counts_ to stand for NumCPUs() static const int kNumCpuMarker = -1; - // Special value used to indicate that no range is required. 
- static const size_t kNoRangeIndex = std::numeric_limits::max(); - static const int kNoRange = std::numeric_limits::max(); - static void AddRange(std::vector* dst, int lo, int hi, int mult); - static double MeasurePeakHeapMemory(const Instance& b); - static void RunInstance(const Instance& b, const BenchmarkReporter* br); - friend class ::benchmark::State; - friend struct ::benchmark::internal::Benchmark::Instance; - friend void ::benchmark::internal::RunMatchingBenchmarks( - const std::string&, const BenchmarkReporter*); + + friend class BenchmarkFamilies; + BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark); }; + // ------------------------------------------------------ // Internal implementation details follow; please ignore @@ -487,16 +488,16 @@ class ConsoleReporter : public BenchmarkReporter { public: virtual bool ReportContext(const Context& context) const; virtual void ReportRuns(const std::vector& reports) const; - private: - std::string PrintMemoryUsage(double bytes) const; virtual void PrintRunData(const Run& report) const; + // TODO(ericwf): Find a better way to share this information. 
mutable size_t name_field_width_; }; } // end namespace internal } // end namespace benchmark + // ------------------------------------------------------ // Macro to register benchmarks @@ -534,4 +535,11 @@ class ConsoleReporter : public BenchmarkReporter { __benchmark_, n, __LINE__) BENCHMARK_UNUSED = \ (new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n)) +// Helper macro to create a main routine in a test that runs the benchmarks +#define BENCHMARK_MAIN() \ + int main(int argc, const char** argv) { \ + ::benchmark::Initialize(&argc, argv); \ + ::benchmark::RunSpecifiedBenchmarks(); \ + } + #endif // BENCHMARK_BENCHMARK_H_ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5f22510d..f3a825f2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,9 +2,8 @@ include_directories(${PROJECT_SOURCE_DIR}/src) # Define the source files -set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc" - "log.cc" "sleep.cc" "string_util.cc" "sysinfo.cc" - "walltime.cc") +set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc" "log.cc" + "sleep.cc" "string_util.cc" "sysinfo.cc" "walltime.cc") # Determine the correct regular expression engine to use if(HAVE_STD_REGEX) set(RE_FILES "re_std.cc") diff --git a/src/benchmark.cc b/src/benchmark.cc index d4f6f1b3..8b0682e6 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -13,30 +13,30 @@ // limitations under the License. 
#include "benchmark/benchmark.h" -#include "arraysize.h" -#include "check.h" -#include "colorprint.h" -#include "commandlineflags.h" -#include "internal_macros.h" -#include "log.h" -#include "re.h" -#include "sleep.h" -#include "stat.h" -#include "string_util.h" -#include "sysinfo.h" -#include "walltime.h" #include -#include +#include +#include +#include +#include #include #include #include #include #include -#include #include -#include + +#include "check.h" +#include "commandlineflags.h" +#include "colorprint.h" +#include "log.h" +#include "mutex.h" +#include "re.h" +#include "stat.h" +#include "string_util.h" +#include "sysinfo.h" +#include "walltime.h" DEFINE_string(benchmark_filter, ".", "A regular expression that specifies the set of benchmarks " @@ -57,124 +57,121 @@ DEFINE_double(benchmark_min_time, 0.5, "of the benchmark execution, regardless of number of " "threads."); -DEFINE_bool(benchmark_memory_usage, false, - "Report memory usage for all benchmarks"); - DEFINE_int32(benchmark_repetitions, 1, "The number of runs of each benchmark. If greater than 1, the " "mean and standard deviation of the runs will be reported."); -DEFINE_int32(v, 0, "The level of verbose logging to output"); DEFINE_bool(color_print, true, "Enables colorized logging."); -// Will be non-empty if heap checking is turned on, which would -// invalidate any benchmarks. -DECLARE_string(heap_check); +DEFINE_int32(v, 0, "The level of verbose logging to output"); + // The ""'s catch people who don't pass in a literal for "str" #define strliterallen(str) (sizeof("" str "") - 1) // Must use a string literal for prefix. -#define memprefix(str, len, prefix) \ - ((((len) >= strliterallen(prefix)) && \ - memcmp(str, prefix, strliterallen(prefix)) == 0) \ - ? str + strliterallen(prefix) \ - : NULL) +#define memprefix(str, len, prefix) \ + ((((len) >= strliterallen(prefix)) && \ + std::memcmp(str, prefix, strliterallen(prefix)) == 0) \ + ? 
str + strliterallen(prefix) \ + : nullptr) + namespace benchmark { + +namespace internal { + +// NOTE: This is a dummy "mutex" type used to denote the actual mutex +// returned by GetBenchmarkLock(). This is only used to placate the thread +// safety warnings by giving the return of GetBenchmarkLock() a name. +struct CAPABILITY("mutex") BenchmarkLockType {}; +BenchmarkLockType BenchmarkLockVar; + +} // end namespace internal + +inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar) +GetBenchmarkLock() +{ + static Mutex lock; + return lock; +} + namespace { + // For non-dense Range, intermediate values are powers of kRangeMultiplier. static const int kRangeMultiplier = 8; - -std::mutex starting_mutex; -std::condition_variable starting_cv; +static const int kMaxIterations = 1000000000; bool running_benchmark = false; -// Should this benchmark report memory usage? -bool get_memory_usage; +// Global variable so that a benchmark can cause a little extra printing +std::string* GetReportLabel() { + static std::string label GUARDED_BY(GetBenchmarkLock()); + return &label; +} // Should this benchmark base decisions off of real time rather than // cpu time? -bool use_real_time; +bool use_real_time GUARDED_BY(GetBenchmarkLock()); -// Overhead of an empty benchmark. -double overhead = 0.0; +// TODO(ericwf): support MallocCounter. +//static benchmark::MallocCounter *benchmark_mc; -// Return prefix to print in front of each reported line -const char* Prefix() { -#ifdef NDEBUG - return ""; -#else - return "DEBUG: "; -#endif -} - -// TODO -// static internal::MallocCounter *benchmark_mc; - -bool CpuScalingEnabled() { +static bool CpuScalingEnabled() { // On Linux, the CPUfreq subsystem exposes CPU information as files on the // local file system. If reading the exported files fails, then we may not be // running on Linux, so we silently ignore all the read errors. 
for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) { - std::stringstream ss; - ss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor"; - std::string governor_file = ss.str(); + std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu, + "/cpufreq/scaling_governor"); FILE* file = fopen(governor_file.c_str(), "r"); if (!file) break; char buff[16]; size_t bytes_read = fread(buff, 1, sizeof(buff), file); fclose(file); - if (memprefix(buff, bytes_read, "performance") == NULL) return true; + if (memprefix(buff, bytes_read, "performance") == nullptr) return true; } return false; } -// Given a collection of reports, computes their mean and stddev. -// REQUIRES: all runs in "reports" must be from the same benchmark. void ComputeStats(const std::vector& reports, BenchmarkReporter::Run* mean_data, BenchmarkReporter::Run* stddev_data) { + CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports"; // Accumulators. Stat1_d real_accumulated_time_stat; Stat1_d cpu_accumulated_time_stat; - Stat1_d items_per_second_stat; Stat1_d bytes_per_second_stat; - Stat1_d iterations_stat; - Stat1MinMax_d max_heapbytes_used_stat; + Stat1_d items_per_second_stat; + // All repetitions should be run with the same number of iterations so we + // can take this information from the first benchmark. + std::size_t const run_iterations = reports.front().iterations; // Populate the accumulators. 
- for (std::vector::const_iterator it = reports.begin(); - it != reports.end(); ++it) { - CHECK_EQ(reports[0].benchmark_name, it->benchmark_name); + for (BenchmarkReporter::Run const& run : reports) { + CHECK_EQ(reports[0].benchmark_name, run.benchmark_name); + CHECK_EQ(run_iterations, run.iterations); real_accumulated_time_stat += - Stat1_d(it->real_accumulated_time / it->iterations, it->iterations); + Stat1_d(run.real_accumulated_time/run.iterations, run.iterations); cpu_accumulated_time_stat += - Stat1_d(it->cpu_accumulated_time / it->iterations, it->iterations); - items_per_second_stat += Stat1_d(it->items_per_second, it->iterations); - bytes_per_second_stat += Stat1_d(it->bytes_per_second, it->iterations); - iterations_stat += Stat1_d(it->iterations, it->iterations); - max_heapbytes_used_stat += - Stat1MinMax_d(it->max_heapbytes_used, it->iterations); + Stat1_d(run.cpu_accumulated_time/run.iterations, run.iterations); + items_per_second_stat += Stat1_d(run.items_per_second, run.iterations); + bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations); } - // Get the data from the accumulator to BenchmarkRunData's. In the - // computations below we must multiply by the number of iterations since - // PrintRunData will divide by it. + // Get the data from the accumulator to BenchmarkReporter::Run's. 
mean_data->benchmark_name = reports[0].benchmark_name + "_mean"; - mean_data->iterations = iterations_stat.Mean(); + mean_data->iterations = run_iterations; mean_data->real_accumulated_time = real_accumulated_time_stat.Mean() * - mean_data->iterations; + run_iterations; mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Mean() * - mean_data->iterations; + run_iterations; mean_data->bytes_per_second = bytes_per_second_stat.Mean(); mean_data->items_per_second = items_per_second_stat.Mean(); - mean_data->max_heapbytes_used = max_heapbytes_used_stat.Max(); // Only add label to mean/stddev if it is same for all runs mean_data->report_label = reports[0].report_label; - for (size_t i = 1; i < reports.size(); i++) { + for (std::size_t i = 1; i < reports.size(); i++) { if (reports[i].report_label != reports[0].report_label) { mean_data->report_label = ""; break; @@ -183,29 +180,166 @@ void ComputeStats(const std::vector& reports, stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev"; stddev_data->report_label = mean_data->report_label; - stddev_data->iterations = iterations_stat.StdDev(); - // The value of iterations_stat.StdDev() above may be 0 if all the repetitions - // have the same number of iterations. Blindly multiplying by 0 in the - // computation of real/cpu_accumulated_time below would lead to 0/0 in - // PrintRunData. So we skip the multiplication in this case and PrintRunData - // skips the division. 
- if (stddev_data->iterations == 0) { - stddev_data->real_accumulated_time = real_accumulated_time_stat.StdDev(); - stddev_data->cpu_accumulated_time = cpu_accumulated_time_stat.StdDev(); - } else { - stddev_data->real_accumulated_time = real_accumulated_time_stat.StdDev() * - stddev_data->iterations; - stddev_data->cpu_accumulated_time = cpu_accumulated_time_stat.StdDev() * - stddev_data->iterations; - } + stddev_data->iterations = 0; + stddev_data->real_accumulated_time = + real_accumulated_time_stat.StdDev(); + stddev_data->cpu_accumulated_time = + cpu_accumulated_time_stat.StdDev(); stddev_data->bytes_per_second = bytes_per_second_stat.StdDev(); stddev_data->items_per_second = items_per_second_stat.StdDev(); - stddev_data->max_heapbytes_used = max_heapbytes_used_stat.StdDev(); } -} // namespace + +struct ThreadStats { + ThreadStats() : bytes_processed(0), items_processed(0) {} + int64_t bytes_processed; + int64_t items_processed; +}; + +// Timer management class +class TimerManager { + public: + TimerManager(int num_threads, Notification* done) + : num_threads_(num_threads), + done_(done), + running_(false), + real_time_used_(0), + cpu_time_used_(0), + num_finalized_(0), + phase_number_(0), + entered_(0) { + } + + // Called by each thread + void StartTimer() EXCLUDES(lock_) { + bool last_thread = false; + { + MutexLock ml(lock_); + last_thread = Barrier(ml); + if (last_thread) { + CHECK(!running_) << "Called StartTimer when timer is already running"; + running_ = true; + start_real_time_ = walltime::Now(); + start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage(); + } + } + if (last_thread) { + phase_condition_.notify_all(); + } + } + + // Called by each thread + void StopTimer() EXCLUDES(lock_) { + bool last_thread = false; + { + MutexLock ml(lock_); + last_thread = Barrier(ml); + if (last_thread) { + CHECK(running_) << "Called StopTimer when timer is already stopped"; + InternalStop(); + } + } + if (last_thread) { + phase_condition_.notify_all(); + } + } + + // 
Called by each thread + void Finalize() EXCLUDES(lock_) { + MutexLock l(lock_); + num_finalized_++; + if (num_finalized_ == num_threads_) { + CHECK(!running_) << + "The timer should be stopped before the timer is finalized"; + done_->Notify(); + } + } + + // REQUIRES: timer is not running + double real_time_used() EXCLUDES(lock_) { + MutexLock l(lock_); + CHECK(!running_); + return real_time_used_; + } + + // REQUIRES: timer is not running + double cpu_time_used() EXCLUDES(lock_) { + MutexLock l(lock_); + CHECK(!running_); + return cpu_time_used_; + } + + private: + Mutex lock_; + Condition phase_condition_; + int num_threads_; + Notification* done_; + + bool running_; // Is the timer running + double start_real_time_; // If running_ + double start_cpu_time_; // If running_ + + // Accumulated time so far (does not contain current slice if running_) + double real_time_used_; + double cpu_time_used_; + + // How many threads have called Finalize() + int num_finalized_; + + // State for barrier management + int phase_number_; + int entered_; // Number of threads that have entered this barrier + + void InternalStop() REQUIRES(lock_) { + CHECK(running_); + running_ = false; + real_time_used_ += walltime::Now() - start_real_time_; + cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage()) + - start_cpu_time_); + } + + // Enter the barrier and wait until all other threads have also + // entered the barrier. Returns true iff this is the last thread to + // enter the barrier. 
+ bool Barrier(MutexLock& ml) REQUIRES(lock_) { + CHECK_LT(entered_, num_threads_); + entered_++; + if (entered_ < num_threads_) { + // Wait for all threads to enter + int phase_number_cp = phase_number_; + auto cb = [this, phase_number_cp]() { + return this->phase_number_ > phase_number_cp; + }; + phase_condition_.wait(ml.native_handle(), cb); + return false; // I was not the last one + } else { + // Last thread has reached the barrier + phase_number_++; + entered_ = 0; + return true; + } + } +}; + +// TimerManager for current run. +static std::unique_ptr timer_manager = nullptr; + +} // end namespace namespace internal { +// Information kept per benchmark we may want to run +struct Benchmark::Instance { + std::string name; + Function* function; + bool has_arg1; + int arg1; + bool has_arg2; + int arg2; + int threads; // Number of concurrent threads to use + bool multithreaded; // Is benchmark multi-threaded? +}; + + // Class for managing registered benchmarks. Note that each registered // benchmark identifies a family of related benchmarks to run. class BenchmarkFamilies { @@ -220,16 +354,17 @@ class BenchmarkFamilies { // Extract the list of benchmark instances that match the specified // regular expression. - void FindBenchmarks(const std::string& re, + bool FindBenchmarks(const std::string& re, std::vector* benchmarks); private: BenchmarkFamilies(); ~BenchmarkFamilies(); std::vector families_; - std::mutex mutex_; + Mutex mutex_; }; + BenchmarkFamilies* BenchmarkFamilies::GetInstance() { static BenchmarkFamilies instance; return &instance; @@ -244,7 +379,7 @@ BenchmarkFamilies::~BenchmarkFamilies() { } size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) { - std::lock_guard l(mutex_); + MutexLock l(mutex_); // This loop attempts to reuse an entry that was previously removed to avoid // unncessary growth of the vector. 
for (size_t index = 0; index < families_.size(); ++index) { @@ -259,392 +394,133 @@ size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) { } void BenchmarkFamilies::RemoveBenchmark(size_t index) { - std::lock_guard l(mutex_); - families_[index] = NULL; + MutexLock l(mutex_); + families_[index] = nullptr; // Don't shrink families_ here, we might be called by the destructor of // BenchmarkFamilies which iterates over the vector. } -void BenchmarkFamilies::FindBenchmarks( +bool BenchmarkFamilies::FindBenchmarks( const std::string& spec, std::vector* benchmarks) { // Make regular expression out of command-line flag + std::string error_msg; Regex re; - std::string re_error; - if (!re.Init(spec, &re_error)) { - std::cerr << "Could not compile benchmark re: " << re_error << std::endl; - return; + if (!re.Init(spec, &error_msg)) { + std::cerr << "Could not compile benchmark re: " << error_msg << std::endl; + return false; } - std::lock_guard l(mutex_); - for (internal::Benchmark* family : families_) { - if (family == nullptr) continue; // Family was deleted + // Special list of thread counts to use when none are specified + std::vector one_thread; + one_thread.push_back(1); - // Match against filter. - if (!re.Match(family->name_)) { - VLOG(1) << "Skipping " << family->name_ << "\n"; - continue; + MutexLock l(mutex_); + for (Benchmark* family : families_) { + // Family was deleted or benchmark doesn't match + if (family == nullptr || !re.Match(family->name_)) continue; + + if (family->arg_count_ == -1) { + family->arg_count_ = 0; + family->args_.emplace_back(-1, -1); } + for (auto const& args : family->args_) { + const std::vector* thread_counts = + (family->thread_counts_.empty() + ? 
&one_thread + : &family->thread_counts_); + for (int num_threads : *thread_counts) { - std::vector instances; - if (family->rangeX_.empty() && family->rangeY_.empty()) { - instances = family->CreateBenchmarkInstances( - Benchmark::kNoRangeIndex, Benchmark::kNoRangeIndex); - std::copy(instances.begin(), instances.end(), - std::back_inserter(*benchmarks)); - } else if (family->rangeY_.empty()) { - for (size_t x = 0; x < family->rangeX_.size(); ++x) { - instances = family->CreateBenchmarkInstances( - x, Benchmark::kNoRangeIndex); - std::copy(instances.begin(), instances.end(), - std::back_inserter(*benchmarks)); - } - } else { - for (size_t x = 0; x < family->rangeX_.size(); ++x) { - for (size_t y = 0; y < family->rangeY_.size(); ++y) { - instances = family->CreateBenchmarkInstances(x, y); - std::copy(instances.begin(), instances.end(), - std::back_inserter(*benchmarks)); + Benchmark::Instance instance; + instance.name = family->name_; + instance.function = family->function_; + instance.has_arg1 = family->arg_count_ >= 1; + instance.arg1 = args.first; + instance.has_arg2 = family->arg_count_ == 2; + instance.arg2 = args.second; + instance.threads = num_threads; + instance.multithreaded = !(family->thread_counts_.empty()); + + // Add arguments to instance name + if (family->arg_count_ >= 1) { + AppendHumanReadable(instance.arg1, &instance.name); } + if (family->arg_count_ >= 2) { + AppendHumanReadable(instance.arg2, &instance.name); + } + + // Add the number of threads used to the name + if (!family->thread_counts_.empty()) { + instance.name += StringPrintF("/threads:%d", instance.threads); + } + + benchmarks->push_back(instance); } } } -} - -std::string ConsoleReporter::PrintMemoryUsage(double bytes) const { - if (!get_memory_usage || bytes < 0.0) return ""; - - std::stringstream ss; - ss << " " << HumanReadableNumber(bytes) << "B peak-mem"; - return ss.str(); -} - -bool ConsoleReporter::ReportContext(const BenchmarkReporter::Context& context) - const { - 
name_field_width_ = context.name_field_width; - - std::cout << "Benchmarking on " << context.num_cpus << " X " - << context.mhz_per_cpu << " MHz CPU" - << ((context.num_cpus > 1) ? "s" : "") << "\n"; - - int remainder_ms; - std::cout << walltime::Print(walltime::Now(), "%Y/%m/%d-%H:%M:%S", - true, // use local timezone - &remainder_ms) << "\n"; - - // Show details of CPU model, caches, TLBs etc. - // if (!context.cpu_info.empty()) - // std::cout << "CPU: " << context.cpu_info.c_str(); - - if (context.cpu_scaling_enabled) { - std::cerr << "CPU scaling is enabled: Benchmark timings may be noisy.\n"; - } - - int output_width = fprintf(stdout, "%s%-*s %10s %10s %10s\n", - Prefix(), int(name_field_width_), "Benchmark", - "Time(ns)", "CPU(ns)", "Iterations"); - std::cout << std::string(output_width - 1, '-').c_str() << "\n"; - return true; } -void ConsoleReporter::ReportRuns( - const std::vector& reports) const { - for (std::vector::const_iterator it = reports.begin(); - it != reports.end(); ++it) { - CHECK_EQ(reports[0].benchmark_name, it->benchmark_name); - PrintRunData(*it); - } - // We don't report aggregated data if there was a single run. 
- if (reports.size() < 2) return; - - BenchmarkReporter::Run mean_data; - BenchmarkReporter::Run stddev_data; - ComputeStats(reports, &mean_data, &stddev_data); - - PrintRunData(mean_data); - PrintRunData(stddev_data); -} - -void ConsoleReporter::PrintRunData(const BenchmarkReporter::Run& result) const { - // Format bytes per second - std::string rate; - if (result.bytes_per_second > 0) { - std::stringstream ss; - ss << " " << HumanReadableNumber(result.bytes_per_second) << "B/s"; - rate = ss.str(); - } - - // Format items per second - std::string items; - if (result.items_per_second > 0) { - std::stringstream ss; - ss << " " << HumanReadableNumber(result.items_per_second) << " items/s"; - items = ss.str(); - } - - ColorPrintf(COLOR_DEFAULT, "%s", Prefix()); - ColorPrintf(COLOR_GREEN, "%-*s ", - name_field_width_, result.benchmark_name.c_str()); - if (result.iterations == 0) { - ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", - result.real_accumulated_time * 1e9, - result.cpu_accumulated_time * 1e9); - } else { - ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", - (result.real_accumulated_time * 1e9) / - (static_cast(result.iterations)), - (result.cpu_accumulated_time * 1e9) / - (static_cast(result.iterations))); - } - ColorPrintf(COLOR_CYAN, "%10lld", result.iterations); - ColorPrintf(COLOR_DEFAULT, "%*s %*s %s %s\n", - 13, rate.c_str(), - 18, items.c_str(), - result.report_label.c_str(), - PrintMemoryUsage(result.max_heapbytes_used).c_str()); -} - -/* TODO(dominic) -void MemoryUsage() { - // if (benchmark_mc) { - // benchmark_mc->Reset(); - //} else { - get_memory_usage = true; - //} -} -*/ - -void PrintUsageAndExit() { - fprintf(stdout, - "benchmark [--benchmark_filter=]\n" - " [--benchmark_iterations=]\n" - " [--benchmark_min_time=]\n" - //" [--benchmark_memory_usage]\n" - " [--benchmark_repetitions=]\n" - " [--color_print={true|false}]\n" - " [--v=]\n"); - exit(0); -} - -void ParseCommandLineFlags(int* argc, const char** argv) { - for (int i = 1; i < *argc; ++i) { - if 
(ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) || - ParseInt32Flag(argv[i], "benchmark_iterations", - &FLAGS_benchmark_iterations) || - ParseDoubleFlag(argv[i], "benchmark_min_time", - &FLAGS_benchmark_min_time) || - // TODO(dominic) - // ParseBoolFlag(argv[i], "gbenchmark_memory_usage", - // &FLAGS_gbenchmark_memory_usage) || - ParseInt32Flag(argv[i], "benchmark_repetitions", - &FLAGS_benchmark_repetitions) || - ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) || - ParseInt32Flag(argv[i], "v", &FLAGS_v)) { - for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1]; - - --(*argc); - --i; - } else if (IsFlag(argv[i], "help")) - PrintUsageAndExit(); - } -} - -} // end namespace internal - -// A clock that provides a fast mechanism to check if we're nearly done. -class State::FastClock { - public: - enum Type { - REAL_TIME, - CPU_TIME - }; - explicit FastClock(Type type) - : type_(type), - approx_time_(NowMicros()), - bg_done_(false), - bg_(BGThreadWrapper, this) { } - - ~FastClock() { - { - std::unique_lock l(bg_mutex_); - bg_done_ = true; - bg_cond_.notify_one(); - } - bg_.join(); - } - - // Returns true if the current time is guaranteed to be past "when_micros". - // This method is very fast. - inline bool HasReached(int64_t when_micros) { - return std::atomic_load(&approx_time_) >= when_micros; - } - - // Returns the current time in microseconds past the epoch. - int64_t NowMicros() const { - double t = 0; - switch (type_) { - case REAL_TIME: - t = walltime::Now(); - break; - case CPU_TIME: - t = MyCPUUsage() + ChildrenCPUUsage(); - break; - } - return static_cast(t * kNumMicrosPerSecond); - } - - // Reinitialize if necessary (since clock type may be change once benchmark - // function starts running - see UseRealTime). 
- void InitType(Type type) { - type_ = type; - std::lock_guard l(bg_mutex_); - std::atomic_store(&approx_time_, NowMicros()); - } - - private: - Type type_; - std::atomic approx_time_; // Last time measurement taken by bg_ - bool bg_done_; // This is used to signal background thread to exit - std::mutex bg_mutex_; - std::condition_variable bg_cond_; - std::thread bg_; // Background thread that updates last_time_ once every ms - - static void* BGThreadWrapper(void* that) { - ((FastClock*)that)->BGThread(); - return NULL; - } - - void BGThread() { - std::unique_lock l(bg_mutex_); - while (!bg_done_) - { - // Set timeout to 1 ms. - bg_cond_.wait_for(l, std::chrono::milliseconds(1)); - std::atomic_store(&approx_time_, NowMicros()); - } - } - - BENCHMARK_DISALLOW_COPY_AND_ASSIGN(FastClock); -}; - -struct State::ThreadStats { - int64_t bytes_processed; - int64_t items_processed; - - ThreadStats() { Reset(); } - - void Reset() { - bytes_processed = 0; - items_processed = 0; - } - - void Add(const ThreadStats& other) { - bytes_processed += other.bytes_processed; - items_processed += other.items_processed; - } -}; - -namespace internal { - -// Information kept per benchmark we may want to run -struct Benchmark::Instance { - Instance() - : bm(nullptr), - threads(1), - rangeXset(false), - rangeX(kNoRange), - rangeYset(false), - rangeY(kNoRange) {} - - std::string name; - Benchmark* bm; - int threads; // Number of concurrent threads to use - - bool rangeXset; - int rangeX; - bool rangeYset; - int rangeY; - - bool multithreaded() const { return !bm->thread_counts_.empty(); } -}; - -} // end namespace internal - -struct State::SharedState { - const internal::Benchmark::Instance* instance; - std::mutex mu; - std::condition_variable cond; - int starting; // Number of threads that have entered STARTING state - int stopping; // Number of threads that have entered STOPPING state - int exited; // Number of threads that have complete exited - int threads; // Number of total threads 
that are running concurrently - ThreadStats stats; - std::vector runs; // accumulated runs - std::string label; - - explicit SharedState(const internal::Benchmark::Instance* b) - : instance(b), - starting(0), - stopping(0), - exited(0), - threads(b == nullptr ? 1 : b->threads) { } - - BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SharedState); -}; - -namespace internal { - -Benchmark::Benchmark(const char* name, BenchmarkFunction f) - : name_(name), function_(f) { +Benchmark::Benchmark(const char* name, + Function* f) + : name_(name), function_(f), arg_count_(-1) { registration_index_ = BenchmarkFamilies::GetInstance()->AddBenchmark(this); } -Benchmark::~Benchmark() { +Benchmark::~Benchmark() { BenchmarkFamilies::GetInstance()->RemoveBenchmark(registration_index_); } Benchmark* Benchmark::Arg(int x) { - std::lock_guard l(mutex_); - rangeX_.push_back(x); + CHECK(arg_count_ == -1 || arg_count_ == 1); + arg_count_ = 1; + args_.emplace_back(x, -1); return this; } Benchmark* Benchmark::Range(int start, int limit) { + CHECK(arg_count_ == -1 || arg_count_ == 1); + arg_count_ = 1; std::vector arglist; AddRange(&arglist, start, limit, kRangeMultiplier); - std::lock_guard l(mutex_); - for (size_t i = 0; i < arglist.size(); ++i) rangeX_.push_back(arglist[i]); + for (int i : arglist) { + args_.emplace_back(i, -1); + } return this; } Benchmark* Benchmark::DenseRange(int start, int limit) { + CHECK(arg_count_ == -1 || arg_count_ == 1); + arg_count_ = 1; CHECK_GE(start, 0); CHECK_LE(start, limit); - std::lock_guard l(mutex_); - for (int arg = start; arg <= limit; ++arg) rangeX_.push_back(arg); + for (int arg = start; arg <= limit; arg++) { + args_.emplace_back(arg, -1); + } return this; } Benchmark* Benchmark::ArgPair(int x, int y) { - std::lock_guard l(mutex_); - rangeX_.push_back(x); - rangeY_.push_back(y); + CHECK(arg_count_ == -1 || arg_count_ == 2); + arg_count_ = 2; + args_.emplace_back(x, y); return this; } Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) { + 
CHECK(arg_count_ == -1 || arg_count_ == 2); + arg_count_ = 2; std::vector arglist1, arglist2; AddRange(&arglist1, lo1, hi1, kRangeMultiplier); AddRange(&arglist2, lo2, hi2, kRangeMultiplier); - std::lock_guard l(mutex_); - rangeX_.resize(arglist1.size()); - std::copy(arglist1.begin(), arglist1.end(), rangeX_.begin()); - rangeY_.resize(arglist2.size()); - std::copy(arglist2.begin(), arglist2.end(), rangeY_.begin()); + for (int i : arglist1) { + for (int j : arglist2) { + args_.emplace_back(i, j); + } + } return this; } @@ -655,7 +531,6 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) { Benchmark* Benchmark::Threads(int t) { CHECK_GT(t, 0); - std::lock_guard l(mutex_); thread_counts_.push_back(t); return this; } @@ -664,14 +539,13 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { CHECK_GT(min_threads, 0); CHECK_GE(max_threads, min_threads); - std::lock_guard l(mutex_); AddRange(&thread_counts_, min_threads, max_threads, 2); return this; } Benchmark* Benchmark::ThreadPerCpu() { - std::lock_guard l(mutex_); - thread_counts_.push_back(NumCPUs()); + static int num_cpus = NumCPUs(); + thread_counts_.push_back(num_cpus); return this; } @@ -682,443 +556,310 @@ void Benchmark::AddRange(std::vector* dst, int lo, int hi, int mult) { // Add "lo" dst->push_back(lo); + static const int kint32max = std::numeric_limits::max(); + // Now space out the benchmarks in multiples of "mult" - for (int32_t i = 1; i < std::numeric_limits::max() / mult; - i *= mult) { + for (int32_t i = 1; i < kint32max/mult; i *= mult) { if (i >= hi) break; - if (i > lo) dst->push_back(i); + if (i > lo) { + dst->push_back(i); + } } // Add "hi" (if different from "lo") - if (hi != lo) dst->push_back(hi); -} - -std::vector Benchmark::CreateBenchmarkInstances( - size_t rangeXindex, size_t rangeYindex) { - // Special list of thread counts to use when none are specified - std::vector one_thread; - one_thread.push_back(1); - - std::vector instances; - - const 
bool is_multithreaded = (!thread_counts_.empty()); - const std::vector& thread_counts = - (is_multithreaded ? thread_counts_ : one_thread); - for (int num_threads : thread_counts) { - Instance instance; - instance.name = name_; - instance.bm = this; - instance.threads = num_threads; - - if (rangeXindex != kNoRangeIndex) { - instance.rangeX = rangeX_[rangeXindex]; - instance.rangeXset = true; - AppendHumanReadable(instance.rangeX, &instance.name); - } - if (rangeYindex != kNoRangeIndex) { - instance.rangeY = rangeY_[rangeYindex]; - instance.rangeYset = true; - AppendHumanReadable(instance.rangeY, &instance.name); - } - - // Add the number of threads used to the name - if (is_multithreaded) { - std::stringstream ss; - ss << "/threads:" << instance.threads; - instance.name += ss.str(); - } - - instances.push_back(instance); + if (hi != lo) { + dst->push_back(hi); } - - return instances; } -void Benchmark::MeasureOverhead() { - State::FastClock clock(State::FastClock::CPU_TIME); - State::SharedState state(nullptr); - State runner(&clock, &state, 0); - while (runner.KeepRunning()) { - } - overhead = state.runs[0].real_accumulated_time / - static_cast(state.runs[0].iterations); - VLOG(1) << "Per-iteration overhead for doing nothing: " << overhead << "\n"; -} +} // end namespace internal -void Benchmark::RunInstance(const Instance& b, const BenchmarkReporter* br) { - use_real_time = false; - running_benchmark = true; - // get_memory_usage = FLAGS_gbenchmark_memory_usage; - State::FastClock clock(State::FastClock::CPU_TIME); +namespace { - // Initialize the test runners. - State::SharedState state(&b); + +// Execute one thread of benchmark b for the specified number of iterations. +// Adds the stats collected for the thread into *total. 
+void RunInThread(const benchmark::internal::Benchmark::Instance* b, + int iters, int thread_id, + ThreadStats* total) EXCLUDES(GetBenchmarkLock()) { + State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id); + b->function(st); + CHECK(st.iterations() == st.max_iterations) << + "Benchmark returned before State::KeepRunning() returned false!"; { - std::vector> runners; - for (int i = 0; i < b.threads; ++i) - runners.push_back(std::unique_ptr(new State(&clock, &state, i))); - - // Run them all. - for (int i = 0; i < b.threads; ++i) { - if (b.multithreaded()) - runners[i]->RunAsThread(); - else - runners[i]->Run(); - } - if (b.multithreaded()) { - for (int i = 0; i < b.threads; ++i) runners[i]->Wait(); - } + MutexLock l(GetBenchmarkLock()); + total->bytes_processed += st.bytes_processed(); + total->items_processed += st.items_processed(); } - /* - double mem_usage = 0; - if (get_memory_usage) { - // Measure memory usage - Notification mem_done; - BenchmarkRun mem_run; - BenchmarkRun::SharedState mem_shared(&b, 1); - mem_run.Init(&clock, &mem_shared, 0); + + timer_manager->Finalize(); +} + +void RunBenchmark(const benchmark::internal::Benchmark::Instance& b, + const BenchmarkReporter* br) EXCLUDES(GetBenchmarkLock()) { + int iters = FLAGS_benchmark_iterations ? 
FLAGS_benchmark_iterations + : 1; + std::vector reports; + + std::vector pool; + if (b.multithreaded) + pool.resize(b.threads); + + for (int i = 0; i < FLAGS_benchmark_repetitions; i++) { + std::string mem; + while (true) { + // Try benchmark + VLOG(2) << "Running " << b.name << " for " << iters << "\n"; + + { - testing::MallocCounter mc(testing::MallocCounter::THIS_THREAD_ONLY); - benchmark_mc = &mc; - mem_run.Run(&mem_done); - mem_done.WaitForNotification(); - benchmark_mc = NULL; - mem_usage = mc.PeakHeapGrowth(); + MutexLock l(GetBenchmarkLock()); + GetReportLabel()->clear(); + use_real_time = false; } - } - */ - running_benchmark = false; - for (BenchmarkReporter::Run& report : state.runs) { - double seconds = (use_real_time ? report.real_accumulated_time - : report.cpu_accumulated_time); - report.benchmark_name = b.name; - report.report_label = state.label; - report.bytes_per_second = state.stats.bytes_processed / seconds; - report.items_per_second = state.stats.items_processed / seconds; - report.max_heapbytes_used = MeasurePeakHeapMemory(b); - } + Notification done; + timer_manager = std::unique_ptr(new TimerManager(b.threads, &done)); - br->ReportRuns(state.runs); -} + ThreadStats total; + running_benchmark = true; + if (b.multithreaded) { + // If this is our first iteration of the while(true) loop then the + // threads haven't been started and can't be joined. Otherwise we need + // to join the threads before replacing them. + for (std::thread& thread : pool) { + if (thread.joinable()) + thread.join(); + } + for (std::size_t ti = 0; ti < pool.size(); ++ti) { + pool[ti] = std::thread(&RunInThread, &b, iters, ti, &total); + } + } else { + // Run directly in this thread + RunInThread(&b, iters, 0, &total); + } + done.WaitForNotification(); + running_benchmark = false; -// Run the specified benchmark, measure its peak memory usage, and -// return the peak memory usage. 
-double Benchmark::MeasurePeakHeapMemory(const Instance&) { - if (!get_memory_usage) return 0.0; - double bytes = 0.0; - /* TODO(dominich) - // Should we do multi-threaded runs? - const int num_threads = 1; - const int num_iters = 1; - { - // internal::MallocCounter mc(internal::MallocCounter::THIS_THREAD_ONLY); - running_benchmark = true; - timer_manager = new TimerManager(1, NULL); - // benchmark_mc = &mc; - timer_manager->StartTimer(); + const double cpu_accumulated_time = timer_manager->cpu_time_used(); + const double real_accumulated_time = timer_manager->real_time_used(); + timer_manager.reset(); - b.Run(num_iters); + VLOG(2) << "Ran in " << cpu_accumulated_time << "/" + << real_accumulated_time << "\n"; - running_benchmark = false; - delete timer_manager; - timer_manager = NULL; - // benchmark_mc = NULL; - // bytes = mc.PeakHeapGrowth(); - } - */ - return bytes; -} + // Base decisions off of real time if requested by this benchmark. + double seconds = cpu_accumulated_time; + std::string label; + { + MutexLock l(GetBenchmarkLock()); + label = *GetReportLabel(); + if (use_real_time) { + seconds = real_accumulated_time; + } + } -} // end namespace internal + // If this was the first run, was elapsed time or cpu time large enough? + // If this is not the first run, go with the current value of iter. 
+ if ((i > 0) || + (iters == FLAGS_benchmark_iterations) || + (iters >= kMaxIterations) || + (seconds >= FLAGS_benchmark_min_time) || + (real_accumulated_time >= 5*FLAGS_benchmark_min_time)) { + double bytes_per_second = 0; + if (total.bytes_processed > 0 && seconds != 0.0) { + bytes_per_second = (total.bytes_processed / seconds); + } + double items_per_second = 0; + if (total.items_processed > 0 && seconds != 0.0) { + items_per_second = (total.items_processed / seconds); + } -State::State(FastClock* clock, SharedState* s, int t) - : thread_index(t), - state_(STATE_INITIAL), - clock_(clock), - shared_(s), - iterations_(0), - start_cpu_(0.0), - start_time_(0.0), - stop_time_micros_(0.0), - start_pause_cpu_(0.0), - pause_cpu_time_(0.0), - start_pause_real_(0.0), - pause_real_time_(0.0), - total_iterations_(0), - interval_micros_(static_cast(kNumMicrosPerSecond * - FLAGS_benchmark_min_time / - FLAGS_benchmark_repetitions)), - is_continuation_(false), - stats_(new ThreadStats()) { - CHECK(clock != nullptr); - CHECK(s != nullptr); -} + // Create report about this benchmark run. + BenchmarkReporter::Run report; + report.benchmark_name = b.name; + report.report_label = label; + // Report the total iterations across all threads. + report.iterations = static_cast(iters) * b.threads; + report.real_accumulated_time = real_accumulated_time; + report.cpu_accumulated_time = cpu_accumulated_time; + report.bytes_per_second = bytes_per_second; + report.items_per_second = items_per_second; + reports.push_back(report); + break; + } -bool State::KeepRunning() { - // Fast path - if ((FLAGS_benchmark_iterations == 0 && - !clock_->HasReached(stop_time_micros_ + - kNumMicrosPerSecond * pause_real_time_)) || - iterations_ < FLAGS_benchmark_iterations) { - ++iterations_; - return true; - } - - // To block thread 0 until all other threads exit, we have a signal exit - // point for KeepRunning() to return false. The fast path above always - // returns true. 
- bool ret = false; - switch (state_) { - case STATE_INITIAL: - ret = StartRunning(); - break; - case STATE_STARTING: - CHECK(false); - ret = true; - break; - case STATE_RUNNING: - ret = FinishInterval(); - break; - case STATE_STOPPING: - ret = MaybeStop(); - break; - case STATE_STOPPED: - CHECK(false); - ret = true; - break; - } - - if (!ret && shared_->threads > 1 && thread_index == 0){ - std::unique_lock l(shared_->mu); - - // Block until all other threads have exited. We can then safely cleanup - // without other threads continuing to access shared variables inside the - // user-provided run function. - while (shared_->exited < shared_->threads - 1) { - shared_->cond.wait(l); + // See how much iterations should be increased by + // Note: Avoid division by zero with max(seconds, 1ns). + double multiplier = FLAGS_benchmark_min_time * 1.4 / std::max(seconds, 1e-9); + // If our last run was at least 10% of FLAGS_benchmark_min_time then we + // use the multiplier directly. Otherwise we use at most 10 times + // expansion. + // NOTE: When the last run was at least 10% of the min time the max + // expansion should be 14x. + bool is_significant = (seconds / FLAGS_benchmark_min_time) > 0.1; + multiplier = is_significant ? 
multiplier : std::min(10.0, multiplier); + if (multiplier <= 1.0) multiplier = 2.0; + double next_iters = std::max(multiplier * iters, iters + 1.0); + if (next_iters > kMaxIterations) { + next_iters = kMaxIterations; + } + VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; + iters = static_cast(next_iters + 0.5); } } - - if (ret) { - ++iterations_; + br->ReportRuns(reports); + if (b.multithreaded) { + for (std::thread& thread : pool) + thread.join(); } - return ret; +} + +} // namespace + +State::State(size_t max_iters, bool has_x, int x, bool has_y, int y, + int thread_i) + : started_(false), total_iterations_(0), + has_range_x_(has_x), range_x_(x), + has_range_y_(has_y), range_y_(y), + bytes_processed_(0), items_processed_(0), + thread_index(thread_i), + max_iterations(max_iters) +{ + CHECK(max_iterations != 0) << "At least one iteration must be run"; } void State::PauseTiming() { - start_pause_cpu_ = MyCPUUsage() + ChildrenCPUUsage(); - start_pause_real_ = walltime::Now(); + // Add in time accumulated so far + CHECK(running_benchmark); + timer_manager->StopTimer(); } void State::ResumeTiming() { - pause_cpu_time_ += MyCPUUsage() + ChildrenCPUUsage() - start_pause_cpu_; - pause_real_time_ += walltime::Now() - start_pause_real_; + CHECK(running_benchmark); + timer_manager->StartTimer(); } -void State::SetBytesProcessed(int64_t bytes) { - CHECK_EQ(STATE_STOPPED, state_); - std::lock_guard l(shared_->mu); - stats_->bytes_processed = bytes; +void State::UseRealTime() { + MutexLock l(GetBenchmarkLock()); + use_real_time = true; } -void State::SetItemsProcessed(int64_t items) { - CHECK_EQ(STATE_STOPPED, state_); - std::lock_guard l(shared_->mu); - stats_->items_processed = items; +void State::SetLabel(const char* label) { + CHECK(running_benchmark); + MutexLock l(GetBenchmarkLock()); + *GetReportLabel() = label; } -void State::SetLabel(const std::string& label) { - CHECK_EQ(STATE_STOPPED, state_); - std::lock_guard l(shared_->mu); - shared_->label = 
label; -} - -int State::range_x() const { - CHECK(shared_->instance->rangeXset); - /* - << - "Failed to get range_x as it was not set. Did you register your " - "benchmark with a range parameter?"; - */ - return shared_->instance->rangeX; -} - -int State::range_y() const { - CHECK(shared_->instance->rangeYset); - /* << - "Failed to get range_y as it was not set. Did you register your " - "benchmark with a range parameter?"; - */ - return shared_->instance->rangeY; -} - -bool State::StartRunning() { - bool last_thread = false; - { - std::lock_guard l(shared_->mu); - CHECK_EQ(state_, STATE_INITIAL); - state_ = STATE_STARTING; - is_continuation_ = false; - CHECK_LT(shared_->starting, shared_->threads); - ++shared_->starting; - last_thread = shared_->starting == shared_->threads; - } - - if (last_thread) { - clock_->InitType(use_real_time ? FastClock::REAL_TIME - : FastClock::CPU_TIME); - { - std::lock_guard l(starting_mutex); - starting_cv.notify_all(); - } - } else { - std::unique_lock l(starting_mutex); - starting_cv.wait(l); - } - CHECK_EQ(state_, STATE_STARTING); - state_ = STATE_RUNNING; - - NewInterval(); - return true; -} - -void State::NewInterval() { - stop_time_micros_ = clock_->NowMicros() + interval_micros_; - if (!is_continuation_) { - VLOG(1) << "Starting new interval; stopping in " << interval_micros_ - << "\n"; - iterations_ = 0; - pause_cpu_time_ = 0; - pause_real_time_ = 0; - start_cpu_ = MyCPUUsage() + ChildrenCPUUsage(); - start_time_ = walltime::Now(); - } else { - VLOG(1) << "Continuing interval; stopping in " << interval_micros_ - << "\n"; - } -} - -bool State::FinishInterval() { - if ((FLAGS_benchmark_iterations != 0 && - iterations_ < - FLAGS_benchmark_iterations / FLAGS_benchmark_repetitions) || - iterations_ < 1) { - interval_micros_ *= 2; - VLOG(1) << "Not enough iterations in interval; " - << "Trying again for " << interval_micros_ << " useconds.\n"; - is_continuation_ = false; - NewInterval(); - return true; - } - - BenchmarkReporter::Run 
data; - data.iterations = iterations_; - data.thread_index = thread_index; - - const double accumulated_time = walltime::Now() - start_time_; - const double total_overhead = overhead * iterations_; - CHECK_LT(pause_real_time_, accumulated_time); - CHECK_LT(pause_real_time_ + total_overhead, accumulated_time); - data.real_accumulated_time = - accumulated_time - (pause_real_time_ + total_overhead); - data.cpu_accumulated_time = (MyCPUUsage() + ChildrenCPUUsage()) - - (pause_cpu_time_ + start_cpu_); - total_iterations_ += iterations_; - - bool keep_going = false; - { - std::lock_guard l(shared_->mu); - - // Either replace the last or add a new data point. - if (is_continuation_) - shared_->runs.back() = data; - else - shared_->runs.push_back(data); - - if (FLAGS_benchmark_iterations != 0) { - // If we need more iterations, run another interval as a continuation. - keep_going = total_iterations_ < FLAGS_benchmark_iterations; - is_continuation_ = keep_going; - } else { - // If this is a repetition, run another interval as a new data point. - keep_going = shared_->runs.size() < - static_cast(FLAGS_benchmark_repetitions); - is_continuation_ = !keep_going; - } - - if (!keep_going) { - ++shared_->stopping; - if (shared_->stopping < shared_->threads) { - // Other threads are still running, so continue running but without - // timing to present an expected background load to the other threads. 
- state_ = STATE_STOPPING; - keep_going = true; - } else { - state_ = STATE_STOPPED; - } - } - } - - if (state_ == STATE_RUNNING) NewInterval(); - return keep_going; -} - -bool State::MaybeStop() { - std::lock_guard l(shared_->mu); - if (shared_->stopping < shared_->threads) { - CHECK_EQ(state_, STATE_STOPPING); - return true; - } - state_ = STATE_STOPPED; - return false; -} - -void State::Run() { - stats_->Reset(); - shared_->instance->bm->function_(*this); - { - std::lock_guard l(shared_->mu); - shared_->stats.Add(*stats_); - } -} - -void State::RunAsThread() { - thread_ = std::thread(State::RunWrapper, this); -} - -void State::Wait() { - if (thread_.joinable()) { - thread_.join(); - } -} - -// static -void* State::RunWrapper(void* arg) { - State* that = (State*)arg; - CHECK(that != nullptr); - that->Run(); - - std::lock_guard l(that->shared_->mu); - - that->shared_->exited++; - if (that->thread_index > 0 && - that->shared_->exited == that->shared_->threads - 1) { - // All threads but thread 0 have exited the user-provided run function. - // Thread 0 can now wake up and exit. - that->shared_->cond.notify_one(); - } - - return nullptr; -} +BenchmarkReporter::~BenchmarkReporter() {} namespace internal { +bool ConsoleReporter::ReportContext(const Context& context) const { + name_field_width_ = context.name_field_width; + + fprintf(stdout, + "Run on (%d X %0.0f MHz CPU%s)\n", + context.num_cpus, + context.mhz_per_cpu, + (context.num_cpus > 1) ? 
"s" : ""); + + int remainder_us; + std::string walltime_str = walltime::Print( + walltime::Now(), "%Y/%m/%d-%H:%M:%S", + true, // use local timezone + &remainder_us); + fprintf(stdout, "%s\n", walltime_str.c_str()); + + if (context.cpu_scaling_enabled) { + fprintf(stdout, "***WARNING*** CPU scaling is enabled, the benchmark " + "timings may be noisy\n"); + } + +#ifndef NDEBUG + fprintf(stdout, "Build Type: DEBUG\n"); +#endif + + int output_width = + fprintf(stdout, + "%-*s %10s %10s %10s\n", + static_cast(name_field_width_), + "Benchmark", + "Time(ns)", "CPU(ns)", + "Iterations"); + fprintf(stdout, "%s\n", std::string(output_width - 1, '-').c_str()); + + return true; +} + +void ConsoleReporter::ReportRuns( + const std::vector& reports) const { + if (reports.empty()) { + return; + } + + for (Run const& run : reports) { + CHECK_EQ(reports[0].benchmark_name, run.benchmark_name); + PrintRunData(run); + } + + if (reports.size() < 2) { + // We don't report aggregated data if there was a single run. + return; + } + + Run mean_data; + Run stddev_data; + ComputeStats(reports, &mean_data, &stddev_data); + + // Output using PrintRun. 
+ PrintRunData(mean_data); + PrintRunData(stddev_data); + fprintf(stdout, "\n"); +} + +void ConsoleReporter::PrintRunData(const Run& result) const { + // Format bytes per second + std::string rate; + if (result.bytes_per_second > 0) { + rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s"); + } + + // Format items per second + std::string items; + if (result.items_per_second > 0) { + items = StrCat(" ", HumanReadableNumber(result.items_per_second), + " items/s"); + } + + double const multiplier = 1e9; // nano second multiplier + ColorPrintf(COLOR_GREEN, "%-*s ", + name_field_width_, result.benchmark_name.c_str()); + if (result.iterations == 0) { + ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", + result.real_accumulated_time * multiplier, + result.cpu_accumulated_time * multiplier); + } else { + ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", + (result.real_accumulated_time * multiplier) / + (static_cast(result.iterations)), + (result.cpu_accumulated_time * multiplier) / + (static_cast(result.iterations))); + } + ColorPrintf(COLOR_CYAN, "%10lld", result.iterations); + ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n", + 13, rate.c_str(), + 18, items.c_str(), + result.report_label.c_str()); +} + void RunMatchingBenchmarks(const std::string& spec, const BenchmarkReporter* reporter) { + CHECK(reporter != nullptr); if (spec.empty()) return; - std::vector benchmarks; - BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &benchmarks); + std::vector benchmarks; + auto families = benchmark::internal::BenchmarkFamilies::GetInstance(); + if (!families->FindBenchmarks(spec, &benchmarks)) return; + // Determine the width of the name field using a minimum width of 10. // Also determine max number of threads needed. 
@@ -1144,45 +885,78 @@ void RunMatchingBenchmarks(const std::string& spec, BenchmarkReporter::Context context; context.num_cpus = NumCPUs(); context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f; - // context.cpu_info = base::CompactCPUIDInfoString(); + context.cpu_scaling_enabled = CpuScalingEnabled(); context.name_field_width = name_field_width; - if (reporter->ReportContext(context)) - for (internal::Benchmark::Instance& benchmark : benchmarks) - Benchmark::RunInstance(benchmark, reporter); + if (reporter->ReportContext(context)) { + for (const auto& benchmark : benchmarks) { + RunBenchmark(benchmark, reporter); + } + } } -void FindMatchingBenchmarkNames(const std::string& spec, - std::vector* benchmark_names) { - if (spec.empty()) return; +} // end namespace internal - std::vector benchmarks; - BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &benchmarks); - std::transform(benchmarks.begin(), benchmarks.end(), benchmark_names->begin(), - [](const internal::Benchmark::Instance& b) { return b.name; }); -} -} // end namespace internal - -void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter /*= nullptr*/) { +void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter) { std::string spec = FLAGS_benchmark_filter; if (spec.empty() || spec == "all") spec = "."; // Regexp that matches all benchmarks internal::ConsoleReporter default_reporter; - internal::RunMatchingBenchmarks( - spec, reporter == nullptr ? &default_reporter : reporter); + internal::RunMatchingBenchmarks(spec, reporter ? 
reporter : &default_reporter); } -void UseRealTime() { use_real_time = true; } +namespace internal { + +void PrintUsageAndExit() { + fprintf(stdout, + "benchmark" + " [--benchmark_filter=]\n" + " [--benchmark_iterations=]\n" + " [--benchmark_min_time=]\n" + " [--benchmark_repetitions=]\n" + " [--color_print={true|false}]\n" + " [--v=]\n"); + exit(0); +} + +void ParseCommandLineFlags(int* argc, const char** argv) { + using namespace benchmark; + for (int i = 1; i < *argc; ++i) { + if ( + ParseStringFlag(argv[i], "benchmark_filter", + &FLAGS_benchmark_filter) || + ParseInt32Flag(argv[i], "benchmark_iterations", + &FLAGS_benchmark_iterations) || + ParseDoubleFlag(argv[i], "benchmark_min_time", + &FLAGS_benchmark_min_time) || + ParseInt32Flag(argv[i], "benchmark_repetitions", + &FLAGS_benchmark_repetitions) || + ParseBoolFlag(argv[i], "color_print", + &FLAGS_color_print) || + ParseInt32Flag(argv[i], "v", &FLAGS_v)) { + for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1]; + + --(*argc); + --i; + } else if (IsFlag(argv[i], "help")) { + PrintUsageAndExit(); + } + } +} + +} // end namespace internal void Initialize(int* argc, const char** argv) { internal::ParseCommandLineFlags(argc, argv); internal::SetLogLevel(FLAGS_v); - // Ensure walltime is initialized by a single thread by forcing the - // initialization. + // TODO remove this. It prints some output the first time it is called. + // We don't want to have this output printed during benchmarking. + MyCPUUsage(); + // The first call to walltime::Now initializes it. Call it once to + // prevent the initialization from happening in a benchmark. 
walltime::Now(); - internal::Benchmark::MeasureOverhead(); } -} // end namespace benchmark +} // end namespace benchmark diff --git a/src/mutex.h b/src/mutex.h new file mode 100644 index 00000000..f37ec35b --- /dev/null +++ b/src/mutex.h @@ -0,0 +1,142 @@ +#ifndef BENCHMARK_MUTEX_H_ +#define BENCHMARK_MUTEX_H_ + +#include +#include + +// Enable thread safety attributes only with clang. +// The attributes can be safely erased when compiling with other compilers. +#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES) +#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) +#else +#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op +#endif + +#define CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(capability(x)) + +#define SCOPED_CAPABILITY \ + THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable) + +#define GUARDED_BY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x)) + +#define PT_GUARDED_BY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x)) + +#define ACQUIRED_BEFORE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__)) + +#define ACQUIRED_AFTER(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__)) + +#define REQUIRES(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__)) + +#define REQUIRES_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__)) + +#define ACQUIRE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__)) + +#define ACQUIRE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__)) + +#define RELEASE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__)) + +#define RELEASE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__)) + +#define TRY_ACQUIRE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__)) + +#define TRY_ACQUIRE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__)) + +#define EXCLUDES(...) 
\ + THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) + +#define ASSERT_CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x)) + +#define ASSERT_SHARED_CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x)) + +#define RETURN_CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) + +#define NO_THREAD_SAFETY_ANALYSIS \ + THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + + +namespace benchmark { + +typedef std::condition_variable Condition; + +// NOTE: Wrappers for std::mutex and std::unique_lock are provided so that +// we can annotate them with thread safety attributes and use the +// -Wthread-safety warning with clang. The standard library types cannot be +// used directly because they do not provide the required annotations. +class CAPABILITY("mutex") Mutex +{ +public: + Mutex() {} + + void lock() ACQUIRE() { mut_.lock(); } + void unlock() RELEASE() { mut_.unlock(); } + std::mutex& native_handle() { + return mut_; + } +private: + std::mutex mut_; +}; + + +class SCOPED_CAPABILITY MutexLock +{ + typedef std::unique_lock MutexLockImp; +public: + MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle()) + { } + ~MutexLock() RELEASE() {} + MutexLockImp& native_handle() { return ml_; } +private: + MutexLockImp ml_; +}; + + +class Notification +{ +public: + Notification() : notified_yet_(false) { } + + void WaitForNotification() const EXCLUDES(mutex_) { + MutexLock m_lock(mutex_); + auto notified_fn = [this]() REQUIRES(mutex_) { + return this->HasBeenNotified(); + }; + cv_.wait(m_lock.native_handle(), notified_fn); + } + + void Notify() EXCLUDES(mutex_) { + { + MutexLock lock(mutex_); + notified_yet_ = 1; + } + cv_.notify_all(); + } + +private: + bool HasBeenNotified() const REQUIRES(mutex_) { + return notified_yet_; + } + + mutable Mutex mutex_; + mutable std::condition_variable cv_; + bool notified_yet_ GUARDED_BY(mutex_); +}; + +} // end namespace benchmark + +#endif // BENCHMARK_MUTEX_H_ diff --git 
a/src/string_util.cc b/src/string_util.cc index 1be15341..ee1badc8 100644 --- a/src/string_util.cc +++ b/src/string_util.cc @@ -24,13 +24,13 @@ static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), "Small SI and Big SI unit arrays must be the same size"); -static const int kUnitsSize = arraysize(kBigSIUnits); +static const int64_t kUnitsSize = arraysize(kBigSIUnits); } // end anonymous namespace void ToExponentAndMantissa(double val, double thresh, int precision, double one_k, std::string* mantissa, - int* exponent) { + int64_t* exponent) { std::stringstream mantissa_stream; if (val < 0) { @@ -80,10 +80,10 @@ void ToExponentAndMantissa(double val, double thresh, int precision, *mantissa = mantissa_stream.str(); } -std::string ExponentToPrefix(int exponent, bool iec) { +std::string ExponentToPrefix(int64_t exponent, bool iec) { if (exponent == 0) return ""; - const int index = (exponent > 0 ? exponent - 1 : -exponent - 1); + const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1); if (index >= kUnitsSize) return ""; const char* array = @@ -97,7 +97,7 @@ std::string ExponentToPrefix(int exponent, bool iec) { std::string ToBinaryStringFullySpecified(double value, double threshold, int precision) { std::string mantissa; - int exponent; + int64_t exponent; ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa, &exponent); return mantissa + ExponentToPrefix(exponent, false); diff --git a/src/sysinfo.cc b/src/sysinfo.cc index ee3c238e..ace7caa4 100644 --- a/src/sysinfo.cc +++ b/src/sysinfo.cc @@ -34,6 +34,7 @@ #include "check.h" #include "cycleclock.h" #include "internal_macros.h" +#include "log.h" #include "sleep.h" namespace benchmark { @@ -322,7 +323,7 @@ double MyCPUUsage() { return value; } // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage(). - std::cout << "Reading /proc/self/cputime_ns failed. 
Using getrusage().\n"; + VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n"; use_cputime_ns = false; } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5d4721be..bc62f432 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,3 +20,6 @@ add_test(filter_regex_none filter_test --benchmark_filter=monkey 0) add_test(filter_regex_wildcard filter_test --benchmark_filter=.*Calculate.* 16) add_test(filter_regex_begin filter_test --benchmark_filter=^BM_Calculate.* 16) add_test(filter_regex_end filter_test --benchmark_filter=.*Pi$ 8) + +compile_benchmark_test(basic_test) +add_test(basic basic_test) diff --git a/test/basic_test.cc b/test/basic_test.cc new file mode 100644 index 00000000..d14f577a --- /dev/null +++ b/test/basic_test.cc @@ -0,0 +1,105 @@ + +#include + +#include "benchmark/benchmark.h" + +#define BASIC_BENCHMARK_TEST(x) \ + BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192) + +void BM_empty(benchmark::State& state) { + while (state.KeepRunning()) { + volatile std::size_t x = state.iterations(); + ((void)x); + } +} +BENCHMARK(BM_empty); +BENCHMARK(BM_empty)->ThreadPerCpu(); + +void BM_spin_empty(benchmark::State& state) { + while (state.KeepRunning()) { + for (int x = 0; x < state.range_x(); ++x) { + volatile int dummy = x; + ((void)dummy); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_empty); +BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu(); + +void BM_spin_pause_before(benchmark::State& state) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + while(state.KeepRunning()) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_before); +BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu(); + + +void BM_spin_pause_during(benchmark::State& state) { + while(state.KeepRunning()) { + state.PauseTiming(); + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + 
state.ResumeTiming(); + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_during); +BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu(); + + +void BM_spin_pause_after(benchmark::State& state) { + while(state.KeepRunning()) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + } + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_after); +BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu(); + + +void BM_spin_pause_before_and_after(benchmark::State& state) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + while(state.KeepRunning()) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + } + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after); +BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu(); + + +void BM_empty_stop_start(benchmark::State& state) { + while (state.KeepRunning()) { } +} +BENCHMARK(BM_empty_stop_start); +BENCHMARK(BM_empty_stop_start)->ThreadPerCpu(); + +BENCHMARK_MAIN() diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc index 2ce1001d..d44ea319 100644 --- a/test/benchmark_test.cc +++ b/test/benchmark_test.cc @@ -53,18 +53,22 @@ static void BM_Factorial(benchmark::State& state) { while (state.KeepRunning()) fac_42 = Factorial(8); // Prevent compiler optimizations - std::cout << fac_42; + std::stringstream ss; + ss << fac_42; + state.SetLabel(ss.str()); } BENCHMARK(BM_Factorial); static void BM_FactorialRealTime(benchmark::State& state) { - benchmark::UseRealTime(); + state.UseRealTime(); int fac_42 = 0; while (state.KeepRunning()) fac_42 = Factorial(8); // Prevent compiler optimizations - std::cout << fac_42; + 
std::stringstream ss; + ss << fac_42; + state.SetLabel(ss.str()); } BENCHMARK(BM_FactorialRealTime); @@ -158,12 +162,5 @@ static void BM_LongTest(benchmark::State& state) { } BENCHMARK(BM_LongTest)->Range(1<<16,1<<28); -int main(int argc, const char* argv[]) { - benchmark::Initialize(&argc, argv); - - assert(Factorial(8) == 40320); - assert(CalculatePi(1) == 0.0); - - benchmark::RunSpecifiedBenchmarks(); -} +BENCHMARK_MAIN()