commit 403f35442375f2ee858981b79421ca321645df08 Author: Dominic Hamon Date: Wed Dec 18 16:55:45 2013 -0800 Initial commit Benchmark library builds and runs but only single-threaded. Multithreaded support needs a bit more love. Currently requires some C++11 support (g++ 4.6.3 seems to work). diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..5dfc6384 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +CMakeCache.txt +CMakeFiles/ +Makefile +bin/ +cmake_install.cmake +lib/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..f363a476 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required (VERSION 2.8) +project (benchmark) + +find_package(Threads) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib) + +set(CMAKE_CXX_FLAGS "-Wall -Werror --std=c++0x") +set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-fno-strict-aliasing -O3 -DNDEBUG") + +# Set OS +if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + add_definitions(-DOS_MACOSX) +endif() + +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") + add_definitions(-DOS_LINUX) +endif() + +if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") + add_definitions(-DOS_WINDOWS) +endif() + +# Set CPU +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86") + add_definitions(-DARCH_X86) +endif() + +# Set up directories +include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_SOURCE_DIR}/src) +link_directories(${PROJECT_SOURCE_DIR}/lib) + +# Build the targets +FILE(GLOB SOURCE_FILES "src/*.cc") +add_library(benchmark STATIC ${SOURCE_FILES}) + +add_executable(benchmark_test test/benchmark_test.cc) +target_link_libraries(benchmark_test benchmark ${CMAKE_THREAD_LIBS_INIT}) + diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h new file mode 100644 index 00000000..d5dec23c --- /dev/null +++ b/include/benchmark/benchmark.h @@ -0,0 +1,540 @@ +// Support for registering benchmarks for functions. + +/* Example usage: +// Define a function that executes the code to be measured a +// specified number of times: +static void BM_StringCreation(benchmark::State& state) { + while (state.KeepRunning()) + std::string empty_string; +} + +// Register the function as a benchmark +BENCHMARK(BM_StringCreation); + +// Define another benchmark +static void BM_StringCopy(benchmark::State& state) { + std::string x = "hello"; + while (state.KeepRunning()) + std::string copy(x); +} +BENCHMARK(BM_StringCopy); + +// Augment the main() program to invoke benchmarks if specified +// via the --benchmarks command line flag. E.g., +// my_unittest --benchmarks=all +// my_unittest --benchmarks=BM_StringCreation +// my_unittest --benchmarks=String +// my_unittest --benchmarks='Copy|Creation' +int main(int argc, char** argv) { + Initialize(&argc, argv); + + RunSpecifiedBenchmarks(); +} + +// Sometimes a family of microbenchmarks can be implemented with +// just one routine that takes an extra argument to specify which +// one of the family of benchmarks to run. For example, the following +// code defines a family of microbenchmarks for measuring the speed +// of memcpy() calls of different lengths: + +static void BM_memcpy(benchmark::State& state) { + char* src = new char[state.range_x()]; char* dst = new char[state.range_x()]; + memset(src, 'x', state.range_x()); + while (state.KeepRunning()) { + memcpy(dst, src, state.range_x()); + SetBenchmarkBytesProcessed(int64_t_t(state.iterations) * int64(state.range_x())); + delete[] src; delete[] dst; +} +BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10); + +// The preceding code is quite repetitive, and can be replaced with the +// following short-hand. The following invocation will pick a few +// appropriate arguments in the specified range and will generate a +// microbenchmark for each such argument. +BENCHMARK(BM_memcpy)->Range(8, 8<<10); + +// You might have a microbenchmark that depends on two inputs. For +// example, the following code defines a family of microbenchmarks for +// measuring the speed of set insertion. +static void BM_SetInsert(benchmark::State& state) { + while (state.KeepRunning()) { + state.PauseTiming(); + set data = ConstructRandomSet(state.range_x()); + state.ResumeTiming(); + for (int j = 0; j < state.rangeY; ++j) + data.insert(RandomNumber()); + } +} +BENCHMARK(BM_SetInsert) + ->ArgPair(1<<10, 1) + ->ArgPair(1<<10, 8) + ->ArgPair(1<<10, 64) + ->ArgPair(1<<10, 512) + ->ArgPair(8<<10, 1) + ->ArgPair(8<<10, 8) + ->ArgPair(8<<10, 64) + ->ArgPair(8<<10, 512); + +// The preceding code is quite repetitive, and can be replaced with +// the following short-hand. The following macro will pick a few +// appropriate arguments in the product of the two specified ranges +// and will generate a microbenchmark for each such pair. +BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512); + +// For more complex patterns of inputs, passing a custom function +// to Apply allows programmatic specification of an +// arbitrary set of arguments to run the microbenchmark on. +// The following example enumerates a dense range on +// one parameter, and a sparse range on the second. +static benchmark::internal::Benchmark* CustomArguments( + benchmark::internal::Benchmark* b) { + for (int i = 0; i <= 10; ++i) + for (int j = 32; j <= 1024*1024; j *= 8) + b = b->ArgPair(i, j); + return b; +} +BENCHMARK(BM_SetInsert)->Apply(CustomArguments); + +// Templated microbenchmarks work the same way: +// Produce then consume 'size' messages 'iters' times +// Measures throughput in the absence of multiprogramming. +template int BM_Sequential(benchmark::State& state) { + Q q; + typename Q::value_type v; + while (state.KeepRunning()) { + for (int i = state.range_x(); i--; ) + q.push(v); + for (int e = state.range_x(); e--; ) + q.Wait(&v); + } + // actually messages, not bytes: + state.SetBytesProcessed( + static_cast(state.iterations())*state.range_x()); +} +BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue)->Range(1<<0, 1<<10); + +In a multithreaded test, it is guaranteed that none of the threads will start +until all have called KeepRunning, and all will have finished before KeepRunning +returns false. As such, any global setup or teardown you want to do can be +wrapped in a check against the thread index: + +static void BM_MultiThreaded(benchmark::State& state) { + if (state.thread_index == 0) { + // Setup code here. + } + while (state.KeepRunning()) { + // Run the test as normal. + } + if (state.thread_index == 0) { + // Teardown code here. + } +} +*/ + +#ifndef BENCHMARK_BENCHMARK_H_ +#define BENCHMARK_BENCHMARK_H_ + +#include + +#include +#include +#include + +#include "macros.h" + +namespace benchmark { +// If the --benchmarks flag is empty, do nothing. +// +// Otherwise, run all benchmarks specified by the --benchmarks flag, +// and exit after running the benchmarks. +extern void RunSpecifiedBenchmarks(); + +// ------------------------------------------------------ +// Routines that can be called from within a benchmark + +// +// REQUIRES: a benchmark is currently executing +extern void SetLabel(const std::string& label); + +// If this routine is called, peak memory allocation past this point in the +// benchmark is reported at the end of the benchmark report line. (It is +// computed by running the benchmark once with a single iteration and a memory +// tracer.) +extern void MemoryUsage(); + +// If a particular benchmark is I/O bound, or if for some reason CPU +// timings are not representative, call this method from within the +// benchmark routine. If called, the elapsed time will be used to +// control how many iterations are run, and in the printing of +// items/second or MB/seconds values. If not called, the cpu time +// used by the benchmark will be used. +extern void UseRealTime(); + +namespace internal { +class Benchmark; +} + +// State is passed to a running Benchmark and contains state for the +// benchmark to use. +class State { + public: + // Returns true iff the benchmark should continue through another iteration. + bool KeepRunning(); + + void PauseTiming(); + void ResumeTiming(); + + // Set the number of bytes processed by the current benchmark + // execution. This routine is typically called once at the end of a + // throughput oriented benchmark. If this routine is called with a + // value > 0, the report is printed in MB/sec instead of nanoseconds + // per iteration. + // + // REQUIRES: a benchmark has exited its KeepRunning loop. + void SetBytesProcessed(int64_t bytes); + + // If this routine is called with items > 0, then an items/s + // label is printed on the benchmark report line for the currently + // executing benchmark. It is typically called at the end of a processing + // benchmark where a processing items/second output is desired. + // + // REQUIRES: a benchmark has exited its KeepRunning loop. + void SetItemsProcessed(int64_t items); + + // If this routine is called, the specified label is printed at the + // end of the benchmark report line for the currently executing + // benchmark. Example: + // static void BM_Compress(int iters) { + // ... + // double compress = input_size / output_size; + // benchmark::SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression)); + // } + // Produces output that looks like: + // BM_Compress 50 50 14115038 compress:27.3% + // + // REQUIRES: a benchmark has exited its KeepRunning loop. + void SetLabel(const std::string& label); + + // Range arguments for this run. CHECKs if the argument has been set. + int range_x() const; + int range_y() const; + + int iterations() const { return total_iterations_; } + + const int thread_index; + + private: + class FastClock; + struct SharedState; + + State(FastClock* clock, SharedState* s, int t); + bool StartRunning(); + bool FinishInterval(); + bool MaybeStop(); + void NewInterval(); + bool AllStarting(); + bool RunAnotherInterval() const; + + void Run(); + + enum EState { + STATE_INITIAL, // KeepRunning hasn't been called + STATE_STARTING, // KeepRunning called, waiting for other threads + STATE_RUNNING, // Running and being timed + STATE_STOPPING, // Not being timed but waiting for other threads + STATE_STOPPED, // Stopped + } state_; + + FastClock* clock_; + + // State shared by all BenchmarkRun objects that belong to the same + // BenchmarkInstance + SharedState* shared_; + + // Custom label set by the user. + std::string label_; + + // Each State object goes through a sequence of measurement intervals. By + // default each interval is approx. 100ms in length. The following stats are + // kept for each interval. + int64_t iterations_; + double start_cpu_; + double start_time_; + int64_t stop_time_micros_; + + double start_pause_; + double pause_time_; + + // Total number of iterations for all finished runs. + int64_t total_iterations_; + + // Approximate time in microseconds for one interval of execution. + // Dynamically adjusted as needed. + int64_t interval_micros_; + + // True if the current interval is the continuation of a previous one. + bool is_continuation_; + + friend class internal::Benchmark; + DISALLOW_COPY_AND_ASSIGN(State); +}; + +namespace internal { +class BenchmarkReporter; + +typedef std::function BenchmarkFunction; + +// Run all benchmarks whose name is a partial match for the regular +// expression in "spec". The results of benchmark runs are fed to "reporter". +void RunMatchingBenchmarks(const std::string& spec, + BenchmarkReporter* reporter); + +// Extract the list of benchmark names that match the specified regular +// expression. +void FindMatchingBenchmarkNames(const std::string& re, + std::vector* benchmark_names); + +// ------------------------------------------------------ +// Benchmark registration object. The BENCHMARK() macro expands +// into an internal::Benchmark* object. Various methods can +// be called on this object to change the properties of the benchmark. +// Each method returns "this" so that multiple method calls can +// chained into one expression. +class Benchmark { + public: + // The Benchmark takes ownership of the Callback pointed to by f. + Benchmark(const char* name, BenchmarkFunction f); + + ~Benchmark(); + + // Note: the following methods all return "this" so that multiple + // method calls can be chained together in one expression. + + // Run this benchmark once with "x" as the extra argument passed + // to the function. + // REQUIRES: The function passed to the constructor must accept an arg1. + Benchmark* Arg(int x); + + // Run this benchmark once for a number of values picked from the + // range [start..limit]. (start and limit are always picked.) + // REQUIRES: The function passed to the constructor must accept an arg1. + Benchmark* Range(int start, int limit); + + // Run this benchmark once for every value in the range [start..limit] + // REQUIRES: The function passed to the constructor must accept an arg1. + Benchmark* DenseRange(int start, int limit); + + // Run this benchmark once with "x,y" as the extra arguments passed + // to the function. + // REQUIRES: The function passed to the constructor must accept arg1,arg2. + Benchmark* ArgPair(int x, int y); + + // Pick a set of values A from the range [lo1..hi1] and a set + // of values B from the range [lo2..hi2]. Run the benchmark for + // every pair of values in the cartesian product of A and B + // (i.e., for all combinations of the values in A and B). + // REQUIRES: The function passed to the constructor must accept arg1,arg2. + Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2); + + // Pass this benchmark object to *func, which can customize + // the benchmark by calling various methods like Arg, ArgPair, + // Threads, etc. + Benchmark* Apply(void (*func)(Benchmark* benchmark)); + + // Support for running multiple copies of the same benchmark concurrently + // in multiple threads. This may be useful when measuring the scaling + // of some piece of code. + + // Run one instance of this benchmark concurrently in t threads. + Benchmark* Threads(int t); + + // Pick a set of values T from [min_threads,max_threads]. + // min_threads and max_threads are always included in T. Run this + // benchmark once for each value in T. The benchmark run for a + // particular value t consists of t threads running the benchmark + // function concurrently. For example, consider: + // BENCHMARK(Foo)->ThreadRange(1,16); + // This will run the following benchmarks: + // Foo in 1 thread + // Foo in 2 threads + // Foo in 4 threads + // Foo in 8 threads + // Foo in 16 threads + Benchmark* ThreadRange(int min_threads, int max_threads); + + // Equivalent to ThreadRange(NumCPUs(), NumCPUs()) + Benchmark* ThreadPerCpu(); + + // TODO(dominich): Control whether or not real-time is used for this benchmark + // TODO(dominich): Control the default number of iterations + + // ------------------------------- + // Following methods are not useful for clients + + // Used inside the benchmark implementation + struct Instance; + struct ThreadStats; + + // Extract the list of benchmark instances that match the specified + // regular expression. + static void FindBenchmarks(const std::string& re, + std::vector* benchmarks); + + // Measure the overhead of an empty benchmark to subtract later. + static void MeasureOverhead(); + + private: + std::vector CreateBenchmarkInstances(int rangeXindex, + int rangeYindex); + + std::string name_; + BenchmarkFunction function_; + int registration_index_; + std::vector rangeX_; + std::vector rangeY_; + std::vector thread_counts_; + + // Special value placed in thread_counts_ to stand for NumCPUs() + static const int kNumCpuMarker = -1; + + // Special value used to indicate that no range is required. + static const int kNoRange = -1; + + static void AddRange(std::vector* dst, int lo, int hi, int mult); + static double MeasurePeakHeapMemory(const Instance& b); + static void RunInstance(const Instance& b, BenchmarkReporter* br); + friend class ::benchmark::State; + friend struct ::benchmark::internal::Benchmark::Instance; + friend void ::benchmark::internal::RunMatchingBenchmarks( + const std::string&, BenchmarkReporter*); + DISALLOW_COPY_AND_ASSIGN(Benchmark); +}; + +// ------------------------------------------------------ +// Benchmarks reporter interface + data containers. + +struct BenchmarkContextData { + int num_cpus; + double mhz_per_cpu; + //std::string cpu_info; + bool cpu_scaling_enabled; + + // The number of chars in the longest benchmark name. + int name_field_width; +}; + +struct BenchmarkRunData { + BenchmarkRunData() : + thread_index(-1), + iterations(1), + real_accumulated_time(0), + cpu_accumulated_time(0), + bytes_per_second(0), + items_per_second(0), + max_heapbytes_used(0) {} + + std::string benchmark_name; + std::string report_label; + int thread_index; + int64_t iterations; + double real_accumulated_time; + double cpu_accumulated_time; + + // Zero if not set by benchmark. + double bytes_per_second; + double items_per_second; + + // This is set to 0.0 if memory tracing is not enabled. + double max_heapbytes_used; +}; + +// Interface for custom benchmark result printers. +// By default, benchmark reports are printed to stdout. However an application +// can control the destination of the reports by calling +// RunMatchingBenchmarks and passing it a custom reporter object. +// The reporter object must implement the following interface. +class BenchmarkReporter { + public: + // Called once for every suite of benchmarks run. + // The parameter "context" contains information that the + // reporter may wish to use when generating its report, for example the + // platform under which the benchmarks are running. The benchmark run is + // never started if this function returns false, allowing the reporter + // to skip runs based on the context information. + virtual bool ReportContext(const BenchmarkContextData& context) = 0; + + // Called once for each group of benchmark runs, gives information about + // cpu-time and heap memory usage during the benchmark run. + // Note that all the grouped benchmark runs should refer to the same + // benchmark, thus have the same name. + virtual void ReportRuns(const std::vector& report) = 0; + + virtual ~BenchmarkReporter(); +}; + + +// ------------------------------------------------------ +// Internal implementation details follow; please ignore + +// Given a collection of reports, computes their mean and stddev. +// REQUIRES: all runs in "reports" must be from the same benchmark. +void ComputeStats(const std::vector& reports, + BenchmarkRunData* mean_data, + BenchmarkRunData* stddev_data); + +// Simple reporter that outputs benchmark data to the console. This is the +// default reporter used by RunSpecifiedBenchmarks(). +class ConsoleReporter : public BenchmarkReporter { + public: + virtual bool ReportContext(const BenchmarkContextData& context); + virtual void ReportRuns(const std::vector& reports); + private: + std::string PrintMemoryUsage(double bytes); + virtual void PrintRunData(const BenchmarkRunData& report); + int name_field_width_; +}; + +} // end namespace internal + +void Initialize(int* argc, const char** argv); +} // end namespace benchmark + +// ------------------------------------------------------ +// Macro to register benchmarks + +// Helpers for generating unique variable names +#define BENCHMARK_CONCAT(a, b, c) BENCHMARK_CONCAT2(a, b, c) +#define BENCHMARK_CONCAT2(a, b, c) a ## b ## c + +#define BENCHMARK(n) \ + static ::benchmark::internal::Benchmark* \ + BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \ + (new ::benchmark::internal::Benchmark(#n, n)) + +// Old-style macros +#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a)) +#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2)) +#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi)) +#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \ + BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2)) + +// This will register a benchmark for a templatized function. For example: +// +// template +// void BM_Foo(int iters); +// +// BENCHMARK_TEMPLATE(BM_Foo, 1); +// +// will register BM_Foo<1> as a benchmark. +#define BENCHMARK_TEMPLATE(n, a) \ + static ::benchmark::internal::Benchmark* \ + BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \ + (new ::benchmark::internal::Benchmark(#n "<" #a ">", n)) + +#define BENCHMARK_TEMPLATE2(n, a, b) \ + static ::benchmark::internal::Benchmark* \ + BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \ + (new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n)) + +#endif // BENCHMARK_BENCHMARK_H_ + diff --git a/include/benchmark/macros.h b/include/benchmark/macros.h new file mode 100644 index 00000000..8c2df946 --- /dev/null +++ b/include/benchmark/macros.h @@ -0,0 +1,120 @@ +#ifndef BENCHMARK_MACROS_H_ +#define BENCHMARK_MACROS_H_ + +#include + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&); + +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// +// One caveat is that, for C++03, arraysize() doesn't accept any array of +// an anonymous type or a type defined inside a function. In these rare +// cases, you have to use the unsafe ARRAYSIZE() macro below. This is +// due to a limitation in C++03's template system. The limitation has +// been removed in C++11. + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. +#ifndef COMPILER_MSVC +template +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// The STATIC_ASSERT macro can be used to verify that a compile time +// expression is true. For example, you could use it to verify the +// size of a static array: +// +// STATIC_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, +// content_type_names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// STATIC_ASSERT(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. + +template +struct StaticAssert { +}; + +#define STATIC_ASSERT(expr, msg) \ + typedef StaticAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] + +// Implementation details of STATIC_ASSERT: +// +// - STATIC_ASSERT works by defining an array type that has -1 +// elements (and thus is invalid) when the expression is false. +// +// - The simpler definition +// +// #define STATIC_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1] +// +// does not work, as gcc supports variable-length arrays whose sizes +// are determined at run-time (this is gcc's extension and not part +// of the C++ standard). As a result, gcc fails to reject the +// following code with the simple definition: +// +// int foo; +// STATIC_ASSERT(foo, msg); // not supposed to compile as foo is +// // not a compile-time constant. +// +// - By using the type StaticAssert<(bool(expr))>, we ensures that +// expr is a compile-time constant. (Template arguments must be +// determined at compile-time.) +// +// - The outer parentheses in StaticAssert<(bool(expr))> are necessary +// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written +// +// StaticAssert +// +// instead, these compilers will refuse to compile +// +// STATIC_ASSERT(5 > 0, some_message); +// +// (They seem to think the ">" in "5 > 0" marks the end of the +// template argument list.) +// +// - The array size is (bool(expr) ? 1 : -1), instead of simply +// +// ((expr) ? 1 : -1). +// +// This is to avoid running into a bug in MS VC 7.1, which +// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. + +#define CHECK(b) do { if (!(b)) assert(false); } while(0) +#define CHECK_EQ(a, b) CHECK((a) == (b)) +#define CHECK_GE(a, b) CHECK((a) >= (b)) +#define CHECK_LE(a, b) CHECK((a) <= (b)) +#define CHECK_GT(a, b) CHECK((a) > (b)) +#define CHECK_LT(a, b) CHECK((a) < (b)) + +// +// Prevent the compiler from complaining about or optimizing away variables +// that appear unused. +#define ATTRIBUTE_UNUSED __attribute__ ((unused)) + +// +// For functions we want to force inline or not inline. +// Introduced in gcc 3.1. +#define ATTRIBUTE_ALWAYS_INLINE __attribute__ ((always_inline)) +#define HAVE_ATTRIBUTE_ALWAYS_INLINE 1 +#define ATTRIBUTE_NOINLINE __attribute__ ((noinline)) +#define HAVE_ATTRIBUTE_NOINLINE 1 + +#endif // BENCHMARK_MACROS_H_ diff --git a/src/benchmark.cc b/src/benchmark.cc new file mode 100644 index 00000000..7a4de8e3 --- /dev/null +++ b/src/benchmark.cc @@ -0,0 +1,1197 @@ +#include "benchmark/benchmark.h" +#include "benchmark/macros.h" +#include "colorprint.h" +#include "commandlineflags.h" +#include "mutex_lock.h" +#include "sleep.h" +#include "stat.h" +#include "sysinfo.h" +#include "walltime.h" + +#include +#include +#include + +#if defined OS_FREEBSD +#include +#else +#include +#endif + +#include +#include +#include +#include + +DEFINE_string(benchmark_filter, ".", + "A regular expression that specifies the set of benchmarks " + "to execute. If this flag is empty, no benchmarks are run. " + "If this flag is the string \"all\", all benchmarks linked " + "into the process are run."); + +DEFINE_int32(benchmark_min_iters, 100, + "Minimum number of iterations per benchmark"); + +DEFINE_int32(benchmark_max_iters, 1000000000, + "Maximum number of iterations per benchmark"); + +DEFINE_double(benchmark_min_time, 0.5, + "Minimum number of seconds we should run benchmark before " + "results are considered significant. For cpu-time based " + "tests, this is the lower bound on the total cpu time " + "used by all threads that make up the test. For real-time " + "based tests, this is the lower bound on the elapsed time " + "of the benchmark execution, regardless of number of " + "threads."); + +DEFINE_bool(benchmark_memory_usage, false, + "Report memory usage for all benchmarks"); + +DEFINE_int32(benchmark_repetitions, 1, + "The number of runs of each benchmark. If greater than 1, the " + "mean and standard deviation of the runs will be reported."); + +DEFINE_int32(v, 0, "The level of verbose logging to output"); +DEFINE_bool(color_print, true, "Enables colorized logging."); + +// Will be non-empty if heap checking is turned on, which would +// invalidate any benchmarks. +DECLARE_string(heap_check); + +// The ""'s catch people who don't pass in a literal for "str" +#define strliterallen(str) (sizeof("" str "")-1) + +// Must use a string literal for prefix. +#define memprefix(str, len, prefix) \ + ( (((len) >= strliterallen(prefix)) \ + && memcmp(str, prefix, strliterallen(prefix)) == 0) \ + ? str + strliterallen(prefix) \ + : NULL ) + +namespace benchmark { + +namespace { + +// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta. +static const char kBigSIUnits[] = "kMGTPEZY"; +// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi. +static const char kBigIECUnits[] = "KMGTPEZY"; +// milli, micro, nano, pico, femto, atto, zepto, yocto. +static const char kSmallSIUnits[] = "munpfazy"; + +// We require that all three arrays have the same size. +STATIC_ASSERT(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), + SI_and_IEC_unit_arrays_must_be_the_same_size); +STATIC_ASSERT(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), + Small_SI_and_Big_SI_unit_arrays_must_be_the_same_size); +static const int kUnitsSize = arraysize(kBigSIUnits); + +void ToExponentAndMantissa(double val, double thresh, + int precision, double one_k, + std::string* mantissa, int* exponent) { + std::stringstream mantissa_stream; + + if (val < 0) { + mantissa_stream << "-"; + val = -val; + } + + // Adjust threshold so that it never excludes things which can't be rendered + // in 'precision' digits. + const double adjusted_threshold = + std::max(thresh, 1.0 / pow(10.0, precision)); + const double big_threshold = adjusted_threshold * one_k; + const double small_threshold = adjusted_threshold; + + if (val > big_threshold) { + // Positive powers + double scaled = val; + for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) { + scaled /= one_k; + if (scaled <= big_threshold) { + mantissa_stream << scaled; + *exponent = i + 1; + *mantissa = mantissa_stream.str(); + return; + } + } + mantissa_stream << val; + *exponent = 0; + } else if (val < small_threshold) { + // Negative powers + double scaled = val; + for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) { + scaled *= one_k; + if (scaled >= small_threshold) { + mantissa_stream << scaled; + *exponent = -i - 1; + *mantissa = mantissa_stream.str(); + return; + } + } + mantissa_stream << val; + *exponent = 0; + } else { + mantissa_stream << val; + *exponent = 0; + } + *mantissa = mantissa_stream.str(); +} + +std::string ExponentToPrefix(int exponent, bool iec) { + if (exponent == 0) + return ""; + + const int index = (exponent > 0 ? exponent - 1 : -exponent - 1); + if (index >= kUnitsSize) + return ""; + + const char *array = (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : + kSmallSIUnits); + if (iec) + return array[index] + std::string("i"); + else + return std::string(1, array[index]); +} + +std::string ToBinaryStringFullySpecified(double value, double threshold, + int precision) { + std::string mantissa; + int exponent; + ToExponentAndMantissa(value, threshold, precision, 1024., &mantissa, + &exponent); + return mantissa + ExponentToPrefix(exponent, false); +} + +inline void AppendHumanReadable(int n, std::string* str) { + std::stringstream ss; + // Round down to the nearest SI prefix. + ss << "/" << ToBinaryStringFullySpecified(n, 1.0, 0); + *str += ss.str(); +} + +inline std::string HumanReadableNumber(double n) { + // 1.1 means that figures up to 1.1k should be shown with the next unit down; + // this softens edge effects. + // 1 means that we should show one decimal place of precision. + return ToBinaryStringFullySpecified(n, 1.1, 1); +} + +} // end namespace + +namespace internal { +struct Benchmark::ThreadStats { + int64_t bytes_processed; + int64_t items_processed; + + ThreadStats() { Reset(); } + + void Reset() { + bytes_processed = 0; + items_processed = 0; + } + + void Add(const ThreadStats& other) { + bytes_processed += other.bytes_processed; + items_processed += other.items_processed; + } +}; + +} // end namespace internal + +namespace { + +// Per-thread stats +pthread_key_t thread_stats_key; +internal::Benchmark::ThreadStats* thread_stats = nullptr; + +// For non-dense Range, intermediate values are powers of kRangeMultiplier. +static const int kRangeMultiplier = 8; + +// List of all registered benchmarks. Note that each registered +// benchmark identifies a family of related benchmarks to run. +static pthread_mutex_t benchmark_mutex; +static std::vector* families = NULL; + +bool running_benchmark = false; + +// Should this benchmark report memory usage? +bool get_memory_usage; + +// Should this benchmark base decisions off of real time rather than +// cpu time? +bool use_real_time; + +// Overhead of an empty benchmark. +double overhead = 0.0; + +void DeleteThreadStats(void* p) { + delete (internal::Benchmark::ThreadStats*) p; +} + +// Return prefix to print in front of each reported line +const char* Prefix() { +#ifdef NDEBUG + return ""; +#else + return "DEBUG: "; +#endif +} + +// TODO +//static internal::MallocCounter *benchmark_mc; + +static bool CpuScalingEnabled() { + // On Linux, the CPUfreq subsystem exposes CPU information as files on the + // local file system. If reading the exported files fails, then we may not be + // running on Linux, so we silently ignore all the read errors. + for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) { + std::stringstream ss; + ss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor"; + std::string governor_file = ss.str(); + FILE* file = fopen(governor_file.c_str(), "r"); + if (!file) + break; + char buff[16]; + size_t bytes_read = fread(buff, 1, sizeof(buff), file); + fclose(file); + if (memprefix(buff, bytes_read, "performance") == NULL) + return true; + } + return false; +} + +} // namespace + +namespace internal { + +BenchmarkReporter::~BenchmarkReporter() {} + +void ComputeStats(const std::vector& reports, + BenchmarkRunData* mean_data, + BenchmarkRunData* stddev_data) { + // Accumulators. + Stat1_d real_accumulated_time_stat; + Stat1_d cpu_accumulated_time_stat; + Stat1_d bytes_per_second_stat; + Stat1_d items_per_second_stat; + Stat1MinMax_d max_heapbytes_used_stat; + int total_iters = 0; + + // Populate the accumulators. + for (std::vector::const_iterator it = reports.begin(); + it != reports.end(); ++it) { + CHECK_EQ(reports[0].benchmark_name, it->benchmark_name); + total_iters += it->iterations; + real_accumulated_time_stat += + Stat1_d(it->real_accumulated_time/it->iterations, it->iterations); + cpu_accumulated_time_stat += + Stat1_d(it->cpu_accumulated_time/it->iterations, it->iterations); + items_per_second_stat += Stat1_d(it->items_per_second, it->iterations); + bytes_per_second_stat += Stat1_d(it->bytes_per_second, it->iterations); + max_heapbytes_used_stat += Stat1MinMax_d(it->max_heapbytes_used, + it->iterations); + } + + // Get the data from the accumulator to BenchmarkRunData's. + mean_data->benchmark_name = reports[0].benchmark_name + "_mean"; + mean_data->iterations = total_iters; + mean_data->real_accumulated_time = real_accumulated_time_stat.Sum(); + mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Sum(); + mean_data->bytes_per_second = bytes_per_second_stat.Mean(); + mean_data->items_per_second = items_per_second_stat.Mean(); + mean_data->max_heapbytes_used = max_heapbytes_used_stat.Max(); + + // Only add label to mean/stddev if it is same for all runs + mean_data->report_label = reports[0].report_label; + for (size_t i = 1; i < reports.size(); i++) { + if (reports[i].report_label != reports[0].report_label) { + mean_data->report_label = ""; + break; + } + } + + stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev"; + stddev_data->report_label = mean_data->report_label; + stddev_data->iterations = total_iters; + // We multiply by total_iters since PrintRunData expects a total time. + stddev_data->real_accumulated_time = + real_accumulated_time_stat.StdDev() * total_iters; + stddev_data->cpu_accumulated_time = + cpu_accumulated_time_stat.StdDev() * total_iters; + stddev_data->bytes_per_second = bytes_per_second_stat.StdDev(); + stddev_data->items_per_second = items_per_second_stat.StdDev(); + stddev_data->max_heapbytes_used = max_heapbytes_used_stat.StdDev(); +} + +std::string ConsoleReporter::PrintMemoryUsage(double bytes) { + if (!get_memory_usage || bytes < 0.0) + return ""; + + std::stringstream ss; + ss << " " << HumanReadableNumber(bytes) << "B peak-mem"; + return ss.str(); +} + +bool ConsoleReporter::ReportContext(const BenchmarkContextData& context) { + name_field_width_ = context.name_field_width; + + std::cout << "Benchmarking on " << context.num_cpus << " X " + << context.mhz_per_cpu << " MHz CPU" + << ((context.num_cpus > 1) ? "s" : "") << "\n"; + + int remainder_ms; + char time_buf[32]; + std::cout << walltime::Print(walltime::Now(), "%Y/%m/%d-%H:%M:%S", + true, // use local timezone + time_buf, &remainder_ms) << "\n"; + + // Show details of CPU model, caches, TLBs etc. +// if (!context.cpu_info.empty()) +// std::cout << "CPU: " << context.cpu_info.c_str(); + + if (context.cpu_scaling_enabled) { + std::cerr << "CPU scaling is enabled: Benchmark timings may be noisy.\n"; + } + + int output_width = fprintf(stdout, "%s%-*s %10s %10s %10s\n", + Prefix(), name_field_width_, "Benchmark", + "Time(ns)", "CPU(ns)", "Iterations"); + std::cout << std::string(output_width - 1, '-').c_str() << "\n"; + + return true; +} + +void ConsoleReporter::ReportRuns(const std::vector& reports) { + for (std::vector::const_iterator it = reports.begin(); + it != reports.end(); ++it) { + CHECK_EQ(reports[0].benchmark_name, it->benchmark_name); + PrintRunData(*it); + } + + // We don't report aggregated data if there was a single run. + if (reports.size() < 2) + return; + + BenchmarkRunData mean_data; + BenchmarkRunData stddev_data; + internal::ComputeStats(reports, &mean_data, &stddev_data); + + // Output using PrintRun. + PrintRunData(mean_data); + PrintRunData(stddev_data); + fprintf(stdout, "\n"); +} + +void ConsoleReporter::PrintRunData(const BenchmarkRunData& result) { + // Format bytes per second + std::string rate; + if (result.bytes_per_second > 0) { + std::stringstream ss; + ss << " " << HumanReadableNumber(result.bytes_per_second) << "B/s"; + rate = ss.str(); + } + + // Format items per second + std::string items; + if (result.items_per_second > 0) { + std::stringstream ss; + ss << " " << HumanReadableNumber(result.items_per_second) << " items/s"; + items = ss.str(); + } + + ColorPrintf(COLOR_DEFAULT, "%s", Prefix()); + ColorPrintf(COLOR_GREEN, "%-*s ", + name_field_width_, result.benchmark_name.c_str()); + ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", + (result.real_accumulated_time * 1e9) / + (static_cast(result.iterations)), + (result.cpu_accumulated_time * 1e9) / + (static_cast(result.iterations))); + ColorPrintf(COLOR_CYAN, "%10lld", result.iterations); + ColorPrintf(COLOR_DEFAULT, "%*s %s %s%s\n", 16, rate.c_str(), items.c_str(), + result.report_label.c_str(), + PrintMemoryUsage(result.max_heapbytes_used).c_str()); +} + +void MemoryUsage() { + //if (benchmark_mc) { + // benchmark_mc->Reset(); + //} else { + get_memory_usage = true; + //} +} + +void UseRealTime() { + use_real_time = true; +} + +void PrintUsageAndExit() { + fprintf(stdout, "benchmark [--benchmark_filter=]\n" + " [--benchmark_min_iters=]\n" + " [--benchmark_max_iters=]\n" + " [--benchmark_min_time=]\n" +// " [--benchmark_memory_usage]\n" + " [--benchmark_repetitions=]\n" + " [--color_print={true|false}]\n" + " [--v=]\n"); + exit(0); +} + +void ParseCommandLineFlags(int* argc, const char** argv) { + for (int i = 1; i < *argc; ++i) { + if (ParseStringFlag(argv[i], "benchmark_filter", + &FLAGS_benchmark_filter) || + ParseInt32Flag(argv[i], "benchmark_min_iters", + &FLAGS_benchmark_min_iters) || + ParseInt32Flag(argv[i], "benchmark_max_iters", + &FLAGS_benchmark_max_iters) || + ParseDoubleFlag(argv[i], "benchmark_min_time", + &FLAGS_benchmark_min_time) || + // TODO(dominic) +// ParseBoolFlag(argv[i], "gbenchmark_memory_usage", +// &FLAGS_gbenchmark_memory_usage) || + ParseInt32Flag(argv[i], "benchmark_repetitions", + &FLAGS_benchmark_repetitions) || + ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) || + ParseInt32Flag(argv[i], "v", &FLAGS_v)) { + for (int j = i; j != *argc; ++j) + argv[j] = argv[j + 1]; + + --(*argc); + --i; + } else if (IsFlag(argv[i], "help")) + PrintUsageAndExit(); + } +} + +} // end namespace internal + +// A clock that provides a fast mechanism to check if we're nearly done. +class State::FastClock { + public: + enum Type { REAL_TIME, CPU_TIME }; + explicit FastClock(Type type) + : type_(type), approx_time_(NowMicros()) { + + sem_init(&bg_done_, 0, 0); + pthread_create(&bg_, NULL, &BGThreadWrapper, this); + } + + ~FastClock() { + sem_post(&bg_done_); + pthread_join(bg_, NULL); + sem_destroy(&bg_done_); + } + + // Returns true if the current time is guaranteed to be past "when_micros". + // This method is very fast. + inline bool HasReached(int64_t when_micros) { + return approx_time_ >= when_micros; + // NOTE: this is the same as we're dealing with an int64_t + //return (base::subtle::NoBarrier_Load(&approx_time_) >= when_micros); + } + + // Returns the current time in microseconds past the epoch. + int64_t NowMicros() const { + double t = 0; + switch (type_) { + case REAL_TIME: + t = walltime::Now(); + break; + case CPU_TIME: + t = MyCPUUsage() + ChildrenCPUUsage(); + break; + } + return static_cast(t * 1e6); + } + + // Reinitialize if necessary (since clock type may be change once benchmark + // function starts running - see UseRealTime). + void InitType(Type type) { + type_ = type; + approx_time_ = NowMicros(); + // NOTE: This is the same barring a memory barrier + // base::subtle::Release_Store(&approx_time_, NowMicros()); + } + + private: + Type type_; + int64_t approx_time_; // Last time measurement taken by bg_ + pthread_t bg_; // Background thread that updates last_time_ once every ms + + sem_t bg_done_; + + static void* BGThreadWrapper(void* that) { + ((FastClock*)that)->BGThread(); + return NULL; + } + + void BGThread() { + int done = 0; + do { + SleepForMicroseconds(1000); + approx_time_ = NowMicros(); + // NOTE: same code but no memory barrier. think on it. + //base::subtle::Release_Store(&approx_time_, NowMicros()); + sem_getvalue(&bg_done_, &done); + } while (done == 0); + } + + DISALLOW_COPY_AND_ASSIGN(FastClock); +}; + +namespace internal { + +const int Benchmark::kNumCpuMarker; + +// Information kept per benchmark we may want to run +struct Benchmark::Instance { + Instance() + : rangeXset(false), rangeX(kNoRange), + rangeYset(false), rangeY(kNoRange) {} + + std::string name; + Benchmark* bm; + bool rangeXset; + int rangeX; + bool rangeYset; + int rangeY; + int threads; // Number of concurrent threads to use + + bool multithreaded() const { return !bm->thread_counts_.empty(); } +}; + +} // end namespace internal + +struct State::SharedState { + const internal::Benchmark::Instance* instance; + pthread_mutex_t mu; + int starting; // Number of threads that have entered STARTING state + int stopping; // Number of threads that have entered STOPPING state + int threads; // Number of total threads that are running concurrently + internal::Benchmark::ThreadStats stats; + std::vector runs; // accumulated runs + std::string label; + + SharedState(const internal::Benchmark::Instance* b, int t) + : instance(b), starting(0), stopping(0), threads(t) { + } + DISALLOW_COPY_AND_ASSIGN(SharedState); +}; + +namespace internal { + +Benchmark::Benchmark(const char* name, BenchmarkFunction f) + : name_(name), function_(f) { + mutex_lock l(&benchmark_mutex); + if (families == nullptr) + families = new std::vector; + registration_index_ = families->size(); + families->push_back(this); +} + +Benchmark::~Benchmark() { + mutex_lock l(&benchmark_mutex); + CHECK((*families)[registration_index_] == this); + (*families)[registration_index_] = NULL; + // Shrink the vector if convenient. + while (!families->empty() && families->back() == NULL) + families->pop_back(); +} + +Benchmark* Benchmark::Arg(int x) { + mutex_lock l(&benchmark_mutex); + rangeX_.push_back(x); + return this; +} + +Benchmark* Benchmark::Range(int start, int limit) { + std::vector arglist; + AddRange(&arglist, start, limit, kRangeMultiplier); + + mutex_lock l(&benchmark_mutex); + for (size_t i = 0; i < arglist.size(); ++i) + rangeX_.push_back(arglist[i]); + return this; +} + +Benchmark* Benchmark::DenseRange(int start, int limit) { + CHECK_GE(start, 0); + CHECK_LE(start, limit); + mutex_lock l(&benchmark_mutex); + for (int arg = start; arg <= limit; ++arg) + rangeX_.push_back(arg); + return this; +} + +Benchmark* Benchmark::ArgPair(int x, int y) { + mutex_lock l(&benchmark_mutex); + rangeX_.push_back(x); + rangeY_.push_back(y); + return this; +} + +Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) { + std::vector arglist1, arglist2; + AddRange(&arglist1, lo1, hi1, kRangeMultiplier); + AddRange(&arglist2, lo2, hi2, kRangeMultiplier); + + mutex_lock l(&benchmark_mutex); + rangeX_.resize(arglist1.size()); + std::copy(arglist1.begin(), arglist1.end(), rangeX_.begin()); + rangeY_.resize(arglist2.size()); + std::copy(arglist2.begin(), arglist2.end(), rangeY_.begin()); + return this; +} + +Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) { + custom_arguments(this); + return this; +} + +Benchmark* Benchmark::Threads(int t) { + CHECK_GT(t, 0); + mutex_lock l(&benchmark_mutex); + thread_counts_.push_back(t); + return this; +} + +Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { + CHECK_GT(min_threads, 0); + CHECK_GE(max_threads, min_threads); + + mutex_lock l(&benchmark_mutex); + AddRange(&thread_counts_, min_threads, max_threads, 2); + return this; +} + +Benchmark* Benchmark::ThreadPerCpu() { + mutex_lock l(&benchmark_mutex); + thread_counts_.push_back(kNumCpuMarker); + return this; +} + +void Benchmark::AddRange(std::vector* dst, int lo, int hi, int mult) { + CHECK_GE(lo, 0); + CHECK_GE(hi, lo); + + // Add "lo" + dst->push_back(lo); + + // Now space out the benchmarks in multiples of "mult" + for (int32_t i = 1; i < std::numeric_limits::max()/mult; i *= mult) { + if (i >= hi) break; + if (i > lo) + dst->push_back(i); + } + // Add "hi" (if different from "lo") + if (hi != lo) + dst->push_back(hi); +} + +std::vector Benchmark::CreateBenchmarkInstances( + int rangeXindex, int rangeYindex) { + // Special list of thread counts to use when none are specified + std::vector one_thread; + one_thread.push_back(1); + + std::vector instances; + + const bool is_multithreaded = (!thread_counts_.empty()); + const std::vector* thread_counts = + (is_multithreaded ? &thread_counts_ : &one_thread); + for (size_t t = 0; t < thread_counts->size(); ++t) { + int num_threads = (*thread_counts)[t]; + if (num_threads == kNumCpuMarker) + num_threads = NumCPUs(); + + Instance instance; + instance.name = name_; + instance.bm = this; + instance.threads = num_threads; + + if (rangeXindex != kNoRange) { + instance.rangeX = rangeX_[rangeXindex]; + instance.rangeXset = true; + AppendHumanReadable(instance.rangeX, &instance.name); + } + if (rangeYindex != kNoRange) { + instance.rangeY = rangeY_[rangeYindex]; + instance.rangeYset = true; + AppendHumanReadable(instance.rangeY, &instance.name); + } + + // Add the number of threads used to the name + if (is_multithreaded) { + std::stringstream ss; + ss << "/threads:" << instance.threads; + instance.name += ss.str(); + } + + instances.push_back(instance); + } + + return instances; +} + +// Extract the list of benchmark instances that match the specified +// regular expression. +void Benchmark::FindBenchmarks(const std::string& spec, + std::vector* benchmarks) { + // Make regular expression out of command-line flag + regex_t re; + int ec = regcomp(&re, spec.c_str(), REG_EXTENDED | REG_NOSUB); + if (ec != 0) { + size_t needed = regerror(ec, &re, NULL, 0); + char* errbuf = new char[needed]; + regerror(ec, &re, errbuf, needed); + std::cerr << "Could not compile benchmark re: " << errbuf << "\n"; + delete[] errbuf; + return; + } + + mutex_lock l(&benchmark_mutex); + for (Benchmark* family : *families) { + if (family == nullptr) continue; // Family was deleted + + // Match against filter. + if (regexec(&re, family->name_.c_str(), 0, NULL, 0) != 0) { +#ifdef DEBUG + std::cout << "Skipping " << family->name_ << "\n"; +#endif + continue; + } + + std::vector instances; + if (family->rangeX_.empty() && family->rangeY_.empty()) { + instances = family->CreateBenchmarkInstances(kNoRange, kNoRange); + benchmarks->insert(benchmarks->end(), instances.begin(), instances.end()); + } else if (family->rangeY_.empty()) { + for (size_t x = 0; x < family->rangeX_.size(); ++x) { + instances = family->CreateBenchmarkInstances(x, kNoRange); + benchmarks->insert(benchmarks->end(), + instances.begin(), instances.end()); + } + } else { + for (size_t x = 0; x < family->rangeX_.size(); ++x) { + for (size_t y = 0; y < family->rangeY_.size(); ++y) { + instances = family->CreateBenchmarkInstances(x, y); + benchmarks->insert(benchmarks->end(), + instances.begin(), instances.end()); + } + } + } + } +} + +void Benchmark::MeasureOverhead() { + State::FastClock clock(State::FastClock::CPU_TIME); + State::SharedState state(NULL, 1); + State runner(&clock, &state, 0); + while (runner.KeepRunning()) {} + overhead = state.runs[0].real_accumulated_time / + static_cast(state.runs[0].iterations); +#ifdef DEBUG + std::cout << "Per-iteration overhead for doing nothing: " << overhead << "\n"; +#endif +} + +void Benchmark::RunInstance(const Instance& b, BenchmarkReporter* br) { + use_real_time = false; + running_benchmark = true; + // get_memory_usage = FLAGS_gbenchmark_memory_usage; + State::FastClock clock(State::FastClock::CPU_TIME); + + // Initialize the test runners. + State::SharedState state(&b, b.threads); + { + std::unique_ptr runners[b.threads]; + // TODO: create thread objects + for (int i = 0; i < b.threads; ++i) + runners[i].reset(new State(&clock, &state, i)); + + // Run them all. + for (int i = 0; i < b.threads; ++i) { + State* r = runners[i].release(); + if (b.multithreaded()) { + // TODO: start pthreads (member of state?) and set up thread local + // pointers to stats + //pool->Add(base::NewCallback(r, &State::Run)); + } else { + pthread_setspecific(thread_stats_key, thread_stats); + r->Run(); + } + } + if (b.multithreaded()) { + // TODO: join all the threads + //pool->JoinAll(); + } + } +/* + double mem_usage = 0; + if (get_memory_usage) { + // Measure memory usage + Notification mem_done; + BenchmarkRun mem_run; + BenchmarkRun::SharedState mem_shared(&b, 1); + mem_run.Init(&clock, &mem_shared, 0); + { + testing::MallocCounter mc(testing::MallocCounter::THIS_THREAD_ONLY); + benchmark_mc = &mc; + mem_run.Run(&mem_done); + mem_done.WaitForNotification(); + benchmark_mc = NULL; + mem_usage = mc.PeakHeapGrowth(); + } + } +*/ + running_benchmark = false; + + for (internal::BenchmarkRunData& report : state.runs) { + double seconds = (use_real_time ? report.real_accumulated_time : + report.cpu_accumulated_time); + // TODO: add the thread index here? + report.benchmark_name = b.name; + report.report_label = state.label; + report.bytes_per_second = state.stats.bytes_processed / seconds; + report.items_per_second = state.stats.items_processed / seconds; + report.max_heapbytes_used = MeasurePeakHeapMemory(b); + } + + br->ReportRuns(state.runs); +} + +// Run the specified benchmark, measure its peak memory usage, and +// return the peak memory usage. +double Benchmark::MeasurePeakHeapMemory(const Instance& b) { + if (!get_memory_usage) + return 0.0; + double bytes = 0.0; + /* TODO(dominich) + // Should we do multi-threaded runs? + const int num_threads = 1; + const int num_iters = 1; + { +// internal::MallocCounter mc(internal::MallocCounter::THIS_THREAD_ONLY); + running_benchmark = true; + timer_manager = new TimerManager(1, NULL); +// benchmark_mc = &mc; + timer_manager->StartTimer(); + + b.Run(num_iters); + + running_benchmark = false; + delete timer_manager; + timer_manager = NULL; +// benchmark_mc = NULL; +// bytes = mc.PeakHeapGrowth(); + } + */ + return bytes; +} + +} // end namespace internal + +State::State(FastClock* clock, SharedState* s, int t) + : thread_index(t), + state_(STATE_INITIAL), + clock_(clock), + shared_(s), + iterations_(0), + start_cpu_(0.0), + start_time_(0.0), + stop_time_micros_(0.0), + start_pause_(0.0), + pause_time_(0.0), + total_iterations_(0), + interval_micros_( + static_cast(1e6 * FLAGS_benchmark_min_time / + FLAGS_benchmark_repetitions)) { +} + +bool State::KeepRunning() { + // Fast path + if (!clock_->HasReached(stop_time_micros_ + pause_time_)) { + ++iterations_; + return true; + } + + switch(state_) { + case STATE_INITIAL: return StartRunning(); + case STATE_STARTING: CHECK(false); return true; + case STATE_RUNNING: return FinishInterval(); + case STATE_STOPPING: return MaybeStop(); + case STATE_STOPPED: CHECK(false); return true; + } + CHECK(false); + return false; +} + +void State::PauseTiming() { + start_pause_ = walltime::Now(); +} + +void State::ResumeTiming() { + pause_time_ += walltime::Now() - start_pause_; +} + +void State::SetBytesProcessed(int64_t bytes) { + CHECK_EQ(STATE_STOPPED, state_); + mutex_lock l(&shared_->mu); + internal::Benchmark::ThreadStats* thread_stats = + (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key); + thread_stats->bytes_processed = bytes; +} + +void State::SetItemsProcessed(int64_t items) { + CHECK_EQ(STATE_STOPPED, state_); + mutex_lock l(&shared_->mu); + internal::Benchmark::ThreadStats* thread_stats = + (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key); + thread_stats->items_processed = items; +} + +void State::SetLabel(const std::string& label) { + CHECK_EQ(STATE_STOPPED, state_); + mutex_lock l(&shared_->mu); + shared_->label = label; +} + +int State::range_x() const { + CHECK(shared_->instance->rangeXset); + /* + << + "Failed to get range_x as it was not set. Did you register your " + "benchmark with a range parameter?"; + */ + return shared_->instance->rangeX; +} + +int State::range_y() const { + CHECK(shared_->instance->rangeYset); + /* << + "Failed to get range_y as it was not set. Did you register your " + "benchmark with a range parameter?"; + */ + return shared_->instance->rangeY; +} + +bool State::StartRunning() { + { + mutex_lock l(&shared_->mu); + CHECK_EQ(state_, STATE_INITIAL); + state_ = STATE_STARTING; + is_continuation_ = false; + CHECK_LT(shared_->starting, shared_->threads); + ++shared_->starting; + if (shared_->starting == shared_->threads) { + // Last thread to start. + clock_->InitType( + use_real_time ? FastClock::REAL_TIME : FastClock::CPU_TIME); + } else { + // Wait for others. + // TODO(dominic): semaphore! + // while (pthread_getsemaphore(shared_->starting_sem_) != + // shared_->threads) { } + //shared_->mu.Await(base::Condition(this, &State::AllStarting)); + } + CHECK_EQ(state_, STATE_STARTING); + state_ = STATE_RUNNING; + } + NewInterval(); + return true; +} + +bool State::AllStarting() { + CHECK_LE(shared_->starting, shared_->threads); + return shared_->starting == shared_->threads; +} + +void State::NewInterval() { + stop_time_micros_ = clock_->NowMicros() + interval_micros_; + if (!is_continuation_) { +#ifdef DEBUG + std::cout << "Starting new interval; stopping in " << interval_micros_ + << "\n"; +#endif + iterations_ = 0; + pause_time_ = 0; + start_cpu_ = MyCPUUsage() + ChildrenCPUUsage(); + start_time_ = walltime::Now(); + } else { +#ifdef DEBUG + std::cout << "Continuing interval; stopping in " << interval_micros_ + << "\n"; +#endif + } +} + +bool State::FinishInterval() { + if (iterations_ < FLAGS_benchmark_min_iters / FLAGS_benchmark_repetitions && + interval_micros_ < 5000000) { + interval_micros_ *= 2; +#ifdef DEBUG + std::cout << "Interval was too short; trying again for " + << interval_micros_ << " useconds.\n"; +#endif + is_continuation_ = false; + NewInterval(); + return true; + } + + internal::BenchmarkRunData data; + data.thread_index = thread_index; + data.iterations = iterations_; + data.thread_index = thread_index; + + const double accumulated_time = walltime::Now() - start_time_; + const double total_overhead = 0.0; // TODO: overhead * iterations_; + CHECK_LT(pause_time_, accumulated_time); + CHECK_LT(pause_time_ + total_overhead, accumulated_time); + data.real_accumulated_time = + accumulated_time - (pause_time_ + total_overhead); + data.cpu_accumulated_time = (MyCPUUsage() + ChildrenCPUUsage()) - start_cpu_; + total_iterations_ += iterations_; + + bool keep_going = false; + { + mutex_lock l(&shared_->mu); + if (is_continuation_) + shared_->runs.back() = data; + else + shared_->runs.push_back(data); + keep_going = RunAnotherInterval(); + if (!keep_going) { + ++shared_->stopping; + if (shared_->stopping < shared_->threads) { + // Other threads are still running, so continue running but without + // timing to present an expected background load to the other threads. + state_ = STATE_STOPPING; + keep_going = true; + } else { + state_ = STATE_STOPPED; + } + } + } + + if (state_ == STATE_RUNNING) { + is_continuation_ = true; + NewInterval(); + } + return keep_going; +} + +bool State::RunAnotherInterval() const { + if (total_iterations_ < FLAGS_benchmark_min_iters) + return true; + if (total_iterations_ > FLAGS_benchmark_max_iters) + return false; + if (static_cast(shared_->runs.size()) >= FLAGS_benchmark_repetitions) + return false; + return true; +} + +bool State::MaybeStop() { + mutex_lock l(&shared_->mu); + if (shared_->stopping < shared_->threads) { + CHECK_EQ(state_, STATE_STOPPING); + return true; + } + state_ = STATE_STOPPED; + return false; +} + +void State::Run() { + internal::Benchmark::ThreadStats* thread_stats = + (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key); + thread_stats->Reset(); + shared_->instance->bm->function_(*this); + { + mutex_lock l(&shared_->mu); + shared_->stats.Add(*thread_stats); + } +} + +namespace internal { + +void RunMatchingBenchmarks(const std::string& spec, + BenchmarkReporter* reporter) { + CHECK(reporter != NULL); + if (spec.empty()) return; + + std::vector benchmarks; + internal::Benchmark::FindBenchmarks(spec, &benchmarks); + + // Determine the width of the name field using a minimum width of 10. + // Also determine max number of threads needed. + int name_field_width = 10; + for (const internal::Benchmark::Instance& benchmark : benchmarks) { + // Add width for _stddev and threads:XX + if (benchmark.threads > 1 && FLAGS_benchmark_repetitions > 1) { + name_field_width = std::max(name_field_width, + benchmark.name.size() + 17); + } else if (benchmark.threads> 1) { + name_field_width = std::max(name_field_width, + benchmark.name.size() + 10); + } else if (FLAGS_benchmark_repetitions > 1) { + name_field_width = std::max(name_field_width, + benchmark.name.size() + 7); + } else { + name_field_width = std::max(name_field_width, + benchmark.name.size()); + } + } + + // Print header here + BenchmarkContextData context; + context.num_cpus = NumCPUs(); + context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f; +// context.cpu_info = base::CompactCPUIDInfoString(); + context.cpu_scaling_enabled = CpuScalingEnabled(); + context.name_field_width = name_field_width; + + if (reporter->ReportContext(context)) { + for (internal::Benchmark::Instance& benchmark : benchmarks) { + //std::unique_ptr pool; + //if (benchmark.threads > 0) { + // pool = new thread::ThreadPool(benchmark.threads); + // pool->StartWorkers(); + //} + Benchmark::RunInstance(/*pool, */benchmark, reporter); + } + } +} + +void FindMatchingBenchmarkNames(const std::string& spec, + std::vector* benchmark_names) { + if (spec.empty()) return; + + std::vector benchmarks; + internal::Benchmark::FindBenchmarks(spec, &benchmarks); + std::transform(benchmarks.begin(), benchmarks.end(), benchmark_names->begin(), + [] (const internal::Benchmark::Instance& b) { return b.name; } ); +} + +} // end namespace internal + +void RunSpecifiedBenchmarks() { + std::string spec = FLAGS_benchmark_filter; + if (spec.empty() || spec == "all") + spec = "."; // Regexp that matches all benchmarks + internal::ConsoleReporter default_reporter; + internal::RunMatchingBenchmarks(spec, &default_reporter); +} + +void Initialize(int* argc, const char** argv) { + //AtomicOps_Internalx86CPUFeaturesInit(); + pthread_mutex_init(&benchmark_mutex, nullptr); + pthread_key_create(&thread_stats_key, DeleteThreadStats); + thread_stats = new internal::Benchmark::ThreadStats(); + walltime::Initialize(); + internal::Benchmark::MeasureOverhead(); + internal::ParseCommandLineFlags(argc, argv); +} + + +} // end namespace benchmark diff --git a/src/colorprint.cc b/src/colorprint.cc new file mode 100644 index 00000000..42d69cb6 --- /dev/null +++ b/src/colorprint.cc @@ -0,0 +1,82 @@ +#include "colorprint.h" + +#include + +#include "commandlineflags.h" + +DECLARE_bool(color_print); + +namespace { +#ifdef OS_WINDOWS +typedef WORD PlatformColorCode; +#else +typedef const char* PlatformColorCode; +#endif + +PlatformColorCode GetPlatformColorCode(LogColor color) { +#ifdef OS_WINDOWS + switch (color) { + case COLOR_RED: return FOREGROUND_RED; + case COLOR_GREEN: return FOREGROUND_GREEN; + case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; + case COLOR_BLUE: return FOREGROUND_BLUE; + case COLOR_MAGENTA: return FOREGROUND_BLUE | FOREGROUND_RED; + case COLOR_CYAN: return FOREGROUND_BLUE | FOREGROUND_GREEN; + case COLOR_WHITE: // fall through to default + default: return 0; + } +#else + switch (color) { + case COLOR_RED: return "1"; + case COLOR_GREEN: return "2"; + case COLOR_YELLOW: return "3"; + case COLOR_BLUE: return "4"; + case COLOR_MAGENTA: return "5"; + case COLOR_CYAN: return "6"; + case COLOR_WHITE: return "7"; + default: return NULL; + }; +#endif +} +} // end namespace + +void ColorPrintf(LogColor color, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + + if (!FLAGS_color_print) { + vprintf(fmt, args); + va_end(args); + return; + } + +#ifdef OS_WINDOWS + const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); + + // Gets the current text color. + CONSOLE_SCREEN_BUFFER_INFO buffer_info; + GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); + const WORD old_color_attrs = buffer_info.wAttributes; + + // We need to flush the stream buffers into the console before each + // SetConsoleTextAttribute call lest it affect the text that is already + // printed but has not yet reached the console. + fflush(stdout); + SetConsoleTextAttribute(stdout_handle, + GetPlatformColorCode(color) | FOREGROUND_INTENSITY); + vprintf(fmt, args); + + fflush(stdout); + // Restores the text color. + SetConsoleTextAttribute(stdout_handle, old_color_attrs); +#else + const char* color_code = GetPlatformColorCode(color); + if (color_code) + fprintf(stdout, "\033[0;3%sm", color_code); + vprintf(fmt, args); + printf("\033[m"); // Resets the terminal to default. +#endif + va_end(args); +} + + diff --git a/src/colorprint.h b/src/colorprint.h new file mode 100644 index 00000000..5789c2e7 --- /dev/null +++ b/src/colorprint.h @@ -0,0 +1,17 @@ +#ifndef BENCHMARK_COLORPRINT_H_ +#define BENCHMARK_COLORPRINT_H_ + +enum LogColor { + COLOR_DEFAULT, + COLOR_RED, + COLOR_GREEN, + COLOR_YELLOW, + COLOR_BLUE, + COLOR_MAGENTA, + COLOR_CYAN, + COLOR_WHITE +}; + +void ColorPrintf(LogColor color, const char* fmt, ...); + +#endif // BENCHMARK_COLORPRINT_H_ diff --git a/src/commandlineflags.cc b/src/commandlineflags.cc new file mode 100644 index 00000000..331b8ff5 --- /dev/null +++ b/src/commandlineflags.cc @@ -0,0 +1,213 @@ +#include "commandlineflags.h" + +#include + +#include +#include + +namespace benchmark { +// Parses 'str' for a 32-bit signed integer. If successful, writes +// the result to *value and returns true; otherwise leaves *value +// unchanged and returns false. +bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) { + // Parses the environment variable as a decimal integer. + char* end = NULL; + const long long_value = strtol(str, &end, 10); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + std::cerr << src_text << " is expected to be a 32-bit integer, " + << "but actually has value \"" << str << "\".\n"; + return false; + } + + // Is the parsed value in the range of an Int32? + const int32_t result = static_cast(long_value); + if (long_value == std::numeric_limits::max() || + long_value == std::numeric_limits::min() || + // The parsed value overflows as a long. (strtol() returns + // LONG_MAX or LONG_MIN when the input overflows.) + result != long_value + // The parsed value overflows as an Int32. + ) { + std::cerr << src_text << " is expected to be a 32-bit integer, " + << "but actually has value \"" << str << "\", " + << "which overflows.\n"; + return false; + } + + *value = result; + return true; +} + +// Parses 'str' for a double. If successful, writes the result to *value and +// returns true; otherwise leaves *value unchanged and returns false. +bool ParseDouble(const std::string& src_text, const char* str, double* value) { + // Parses the environment variable as a decimal integer. + char* end = NULL; + const double double_value = strtod(str, &end); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + std::cerr << src_text << " is expected to be a double, " + << "but actually has value \"" << str << "\".\n"; + return false; + } + + *value = double_value; + return true; +} + + +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE + // We are on Windows CE, which has no environment variables. + return NULL; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != NULL && env[0] != '\0') ? env : NULL; +#else + return getenv(name); +#endif +} + +// Returns the name of the environment variable corresponding to the +// given flag. For example, FlagToEnvVar("foo") will return +// "BENCHMARK_FOO" in the open-source version. +static std::string FlagToEnvVar(const char* flag) { + const std::string flag_str(flag); + + std::string env_var; + for (size_t i = 0; i != flag_str.length(); ++i) + env_var += ::toupper(flag_str.c_str()[i]); + + return "BENCHMARK_" + env_var; +} + +// Reads and returns the Boolean environment variable corresponding to +// the given flag; if it's not set, returns default_value. +// +// The value is considered true iff it's not "0". +bool BoolFromEnv(const char* flag, bool default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = GetEnv(env_var.c_str()); + return string_value == NULL ? + default_value : strcmp(string_value, "0") != 0; +} + +// Reads and returns a 32-bit integer stored in the environment +// variable corresponding to the given flag; if it isn't set or +// doesn't represent a valid 32-bit integer, returns default_value. +int32_t Int32FromEnv(const char* flag, int32_t default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = GetEnv(env_var.c_str()); + if (string_value == NULL) { + // The environment variable is not set. + return default_value; + } + + int32_t result = default_value; + if (!ParseInt32(std::string("Environment variable ") + env_var, + string_value, &result)) { + std::cout << "The default value " << default_value << " is used.\n"; + return default_value; + } + + return result; +} + +// Reads and returns the string environment variable corresponding to +// the given flag; if it's not set, returns default_value. +const char* StringFromEnv(const char* flag, const char* default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const value = GetEnv(env_var.c_str()); + return value == NULL ? default_value : value; +} + +// Parses a string as a command line flag. The string should have +// the format "--flag=value". When def_optional is true, the "=value" +// part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +const char* ParseFlagValue(const char* str, + const char* flag, + bool def_optional) { + // str and flag must not be NULL. + if (str == NULL || flag == NULL) return NULL; + + // The flag must start with "--". + const std::string flag_str = std::string("--") + std::string(flag); + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; + + // Skips the flag name. + const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) + return flag_end; + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return NULL; + + // Returns the string after "=". + return flag_end + 1; +} + +bool ParseBoolFlag(const char* str, const char* flag, bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, true); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + return ParseInt32(std::string("The value of flag --") + flag, + value_str, value); +} + +bool ParseDoubleFlag(const char* str, const char* flag, double* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + return ParseDouble(std::string("The value of flag --") + flag, + value_str, value); +} + +bool ParseStringFlag(const char* str, const char* flag, std::string* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + *value = value_str; + return true; +} + +bool IsFlag(const char* str, const char* flag) { + return (ParseFlagValue(str, flag, true) != NULL); +} +} // end namespace benchmark diff --git a/src/commandlineflags.h b/src/commandlineflags.h new file mode 100644 index 00000000..056d9fc0 --- /dev/null +++ b/src/commandlineflags.h @@ -0,0 +1,79 @@ +#ifndef BENCHMARK_COMMANDLINEFLAGS_H_ +#define BENCHMARK_COMMANDLINEFLAGS_H_ + +#include + +#include + +// Macro for referencing flags. +#define FLAG(name) FLAGS_##name + +// Macros for declaring flags. +#define DECLARE_bool(name) extern bool FLAG(name) +#define DECLARE_int32(name) extern int32_t FLAG(name) +#define DECLARE_int64(name) extern int64_t FLAG(name) +#define DECLARE_double(name) extern double FLAG(name) +#define DECLARE_string(name) extern std::string FLAG(name) + +// Macros for defining flags. +#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val) +#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val) +#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val) +#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val) +#define DEFINE_string(name, default_val, doc) \ + std::string FLAG(name) = (default_val) + +namespace benchmark { + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +bool ParseInt32(const std::string& src_text, const char* str, int32_t* value); + +// Parses a bool/Int32/string from the environment variable +// corresponding to the given Google Test flag. +bool BoolFromEnv(const char* flag, bool default_val); +int32_t Int32FromEnv(const char* flag, int32_t default_val); +double DoubleFromEnv(const char* flag, double default_val); +const char* StringFromEnv(const char* flag, const char* default_val); + +// Parses a string for a bool flag, in the form of either +// "--flag=value" or "--flag". +// +// In the former case, the value is taken as true as long as it does +// not start with '0', 'f', or 'F'. +// +// In the latter case, the value is taken as true. +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseBoolFlag(const char* str, const char* flag, bool* value); + +// Parses a string for an Int32 flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseInt32Flag(const char* str, const char* flag, int32_t* value); + +// Parses a string for a Double flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseDoubleFlag(const char* str, const char* flag, double* value); + +// Parses a string for a string flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseStringFlag(const char* str, const char* flag, std::string* value); + +// Returns true if the string matches the flag. +bool IsFlag(const char* str, const char* flag); + +} // end namespace gbenchmark + +#endif // BENCHMARK_COMMANDLINEFLAGS_H_ + diff --git a/src/cycleclock.h b/src/cycleclock.h new file mode 100644 index 00000000..d5ba314f --- /dev/null +++ b/src/cycleclock.h @@ -0,0 +1,129 @@ +// ---------------------------------------------------------------------- +// CycleClock +// A CycleClock tells you the current time in Cycles. The "time" +// is actually time since power-on. This is like time() but doesn't +// involve a system call and is much more precise. +// +// NOTE: Not all cpu/platform/kernel combinations guarantee that this +// clock increments at a constant rate or is synchronized across all logical +// cpus in a system. +// +// If you need the above guarantees, please consider using a different +// API. There are efforts to provide an interface which provides a millisecond +// granularity and implemented as a memory read. A memory read is generally +// cheaper than the CycleClock for many architectures. +// +// Also, in some out of order CPU implementations, the CycleClock is not +// serializing. So if you're trying to count at cycles granularity, your +// data might be inaccurate due to out of order instruction execution. +// ---------------------------------------------------------------------- + +#ifndef BENCHMARK_CYCLECLOCK_H_ +#define BENCHMARK_CYCLECLOCK_H_ + +#include + +#if defined(OS_MACOSX) +# include +#endif +// For MSVC, we want to use '_asm rdtsc' when possible (since it works +// with even ancient MSVC compilers), and when not possible the +// __rdtsc intrinsic, declared in . Unfortunately, in some +// environments, and have conflicting +// declarations of some other intrinsics, breaking compilation. +// Therefore, we simply declare __rdtsc ourselves. See also +// http://connect.microsoft.com/VisualStudio/feedback/details/262047 +#if defined(COMPILER_MSVC) && !defined(_M_IX86) +extern "C" uint64_t __rdtsc(); +#pragma intrinsic(__rdtsc) +#endif +#include + +// NOTE: only i386 and x86_64 have been well tested. +// PPC, sparc, alpha, and ia64 are based on +// http://peter.kuscsik.com/wordpress/?p=14 +// with modifications by m3b. See also +// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h +struct CycleClock { + // This should return the number of cycles since power-on. Thread-safe. + static inline int64_t Now() { +#if defined(OS_MACOSX) + // this goes at the top because we need ALL Macs, regardless of + // architecture, to return the number of "mach time units" that + // have passed since startup. See sysinfo.cc where + // InitializeSystemInfo() sets the supposed cpu clock frequency of + // macs to the number of mach time units per second, not actual + // CPU clock frequency (which can change in the face of CPU + // frequency scaling). Also note that when the Mac sleeps, this + // counter pauses; it does not continue counting, nor does it + // reset to zero. + return mach_absolute_time(); +#elif defined(__i386__) + int64_t ret; + __asm__ volatile ("rdtsc" : "=A" (ret) ); + return ret; +#elif defined(__x86_64__) || defined(__amd64__) + uint64_t low, high; + __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high)); + return (high << 32) | low; +#elif defined(__powerpc__) || defined(__ppc__) + // This returns a time-base, which is not always precisely a cycle-count. + int64_t tbl, tbu0, tbu1; + asm("mftbu %0" : "=r" (tbu0)); + asm("mftb %0" : "=r" (tbl)); + asm("mftbu %0" : "=r" (tbu1)); + tbl &= -static_cast(tbu0 == tbu1); + // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage) + return (tbu1 << 32) | tbl; +#elif defined(__sparc__) + int64_t tick; + asm(".byte 0x83, 0x41, 0x00, 0x00"); + asm("mov %%g1, %0" : "=r" (tick)); + return tick; +#elif defined(__ia64__) + int64_t itc; + asm("mov %0 = ar.itc" : "=r" (itc)); + return itc; +#elif defined(COMPILER_MSVC) && defined(_M_IX86) + // Older MSVC compilers (like 7.x) don't seem to support the + // __rdtsc intrinsic properly, so I prefer to use _asm instead + // when I know it will work. Otherwise, I'll use __rdtsc and hope + // the code is being compiled with a non-ancient compiler. + _asm rdtsc +#elif defined(COMPILER_MSVC) + return __rdtsc(); +#elif defined(ARMV3) +#if defined(ARMV6) // V6 is the earliest arch that has a standard cyclecount + uint32_t pmccntr; + uint32_t pmuseren; + uint32_t pmcntenset; + // Read the user mode perf monitor counter access permissions. + asm("mrc p15, 0, %0, c9, c14, 0" : "=r" (pmuseren)); + if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. + asm("mrc p15, 0, %0, c9, c12, 1" : "=r" (pmcntenset)); + if (pmcntenset & 0x80000000ul) { // Is it counting? + asm("mrc p15, 0, %0, c9, c13, 0" : "=r" (pmccntr)); + // The counter is set up to count every 64th cycle + return static_cast(pmccntr) * 64; // Should optimize to << 6 + } + } +#endif + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(__mips__) + // mips apparently only allows rdtsc for superusers, so we fall + // back to gettimeofday. It's possible clock_gettime would be better. + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#else +// The soft failover to a generic implementation is automatic only for ARM. +// For other platforms the developer is expected to make an attempt to create +// a fast implementation and use generic version if nothing better is available. +#error You need to define CycleTimer for your OS and CPU +#endif + } +}; + +#endif // BENCHMARK_CYCLECLOCK_H_ diff --git a/src/macros.h b/src/macros.h new file mode 100644 index 00000000..b4703282 --- /dev/null +++ b/src/macros.h @@ -0,0 +1,110 @@ +#ifndef BENCHMARK_MACROS_H_ +#define BENCHMARK_MACROS_H_ + +#include + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&); + +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// +// One caveat is that, for C++03, arraysize() doesn't accept any array of +// an anonymous type or a type defined inside a function. In these rare +// cases, you have to use the unsafe ARRAYSIZE() macro below. This is +// due to a limitation in C++03's template system. The limitation has +// been removed in C++11. + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. +#ifndef COMPILER_MSVC +template +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(ArraySizeHelper(array))) + +// The STATIC_ASSERT macro can be used to verify that a compile time +// expression is true. For example, you could use it to verify the +// size of a static array: +// +// STATIC_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, +// content_type_names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// STATIC_ASSERT(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. + +template +struct StaticAssert { +}; + +#define STATIC_ASSERT(expr, msg) \ + typedef StaticAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] + +// Implementation details of STATIC_ASSERT: +// +// - STATIC_ASSERT works by defining an array type that has -1 +// elements (and thus is invalid) when the expression is false. +// +// - The simpler definition +// +// #define STATIC_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1] +// +// does not work, as gcc supports variable-length arrays whose sizes +// are determined at run-time (this is gcc's extension and not part +// of the C++ standard). As a result, gcc fails to reject the +// following code with the simple definition: +// +// int foo; +// STATIC_ASSERT(foo, msg); // not supposed to compile as foo is +// // not a compile-time constant. +// +// - By using the type StaticAssert<(bool(expr))>, we ensures that +// expr is a compile-time constant. (Template arguments must be +// determined at compile-time.) +// +// - The outer parentheses in StaticAssert<(bool(expr))> are necessary +// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written +// +// StaticAssert +// +// instead, these compilers will refuse to compile +// +// STATIC_ASSERT(5 > 0, some_message); +// +// (They seem to think the ">" in "5 > 0" marks the end of the +// template argument list.) +// +// - The array size is (bool(expr) ? 1 : -1), instead of simply +// +// ((expr) ? 1 : -1). +// +// This is to avoid running into a bug in MS VC 7.1, which +// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. + +#define CHECK(b) do { if (!(b)) assert(false); } while(0) +#define CHECK_EQ(a, b) CHECK((a) == (b)) +#define CHECK_GE(a, b) CHECK((a) >= (b)) +#define CHECK_LE(a, b) CHECK((a) <= (b)) +#define CHECK_GT(a, b) CHECK((a) > (b)) +#define CHECK_LT(a, b) CHECK((a) < (b)) + + +#define ATTRIBUTE_UNUSED __attribute__ ((unused)) + +#endif // BENCHMARK_MACROS_H_ diff --git a/src/mutex_lock.h b/src/mutex_lock.h new file mode 100644 index 00000000..40f0fdee --- /dev/null +++ b/src/mutex_lock.h @@ -0,0 +1,20 @@ +#ifndef BENCHMARK_MUTEX_LOCK_H_ +#define BENCHMARK_MUTEX_LOCK_H_ + +#include + +class mutex_lock { + public: + explicit mutex_lock(pthread_mutex_t* mu) : mu_(mu) { + pthread_mutex_lock(mu_); + } + + ~mutex_lock() { + pthread_mutex_unlock(mu_); + } + + private: + pthread_mutex_t* mu_; +}; + +#endif // BENCHMARK_MUTEX_LOCK_H_ diff --git a/src/port.h b/src/port.h new file mode 100644 index 00000000..7d8fe1cc --- /dev/null +++ b/src/port.h @@ -0,0 +1,8 @@ +#ifndef BENCHMARK_PORT_H_ +#define BENCHMARK_PORT_H_ + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&); + +#endif // BENCHMARK_PORT_H_ diff --git a/src/sleep.cc b/src/sleep.cc new file mode 100644 index 00000000..82292919 --- /dev/null +++ b/src/sleep.cc @@ -0,0 +1,42 @@ +#include "sleep.h" + +#include +#include + +#ifdef OS_WINDOWS + +// Window's _sleep takes milliseconds argument. +void SleepForMilliseconds(int milliseconds) { + _sleep(milliseconds); +} +void SleepForSeconds(double seconds) { + SleepForMilliseconds(static_cast(seconds * 1000)); +} + +#else // OS_WINDOWS + +static const int64_t kNumMillisPerSecond = 1000LL; +static const int64_t kNumMicrosPerMilli = 1000LL; +static const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL; +static const int64_t kNumNanosPerMicro = 1000LL; + +void SleepForMicroseconds(int64_t microseconds) { + struct timespec sleep_time; + sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; + sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; + while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) + ; // Ignore signals and wait for the full interval to elapse. +} + +void SleepForMilliseconds(int milliseconds) { + SleepForMicroseconds(static_cast(milliseconds) * kNumMicrosPerMilli); +} + +void SleepForSeconds(double seconds) { + SleepForMicroseconds(static_cast(seconds * kNumMicrosPerSecond)); +} + +#endif // OS_WINDOWS + + + diff --git a/src/sleep.h b/src/sleep.h new file mode 100644 index 00000000..35b4263b --- /dev/null +++ b/src/sleep.h @@ -0,0 +1,10 @@ +#ifndef BENCHMARK_SLEEP_H_ +#define BENCHMARK_SLEEP_H_ + +#include + +void SleepForMicroseconds(int64_t microseconds); +void SleepForMilliseconds(int milliseconds); +void SleepForSeconds(double seconds); + +#endif // BENCHMARK_SLEEP_H_ diff --git a/src/stat.h b/src/stat.h new file mode 100644 index 00000000..b121d47e --- /dev/null +++ b/src/stat.h @@ -0,0 +1,306 @@ +#ifndef BENCHMARK_STAT_H_ +#define BENCHMARK_STAT_H_ + +#include +#include +#include + +template +class Stat1; + +template +class Stat1MinMax; + +typedef Stat1 Stat1_f; +typedef Stat1 Stat1_d; +typedef Stat1MinMax Stat1MinMax_f; +typedef Stat1MinMax Stat1MinMax_d; + +template class Vector2; +template class Vector3; +template class Vector4; + +template +class Stat1 { + public: + typedef Stat1 Self; + + Stat1() { + Clear(); + } + void Clear() { + numsamples_ = NumType(); + sum_squares_ = sum_ = VType(); + } + // Create a sample of value dat and weight 1 + explicit Stat1(const VType &dat) { + sum_ = dat; + sum_squares_ = Sqr(dat); + numsamples_ = 1; + } + // Create statistics for all the samples between begin (included) + // and end(excluded) + explicit Stat1(const VType *begin, const VType *end) { + Clear(); + for ( const VType *item = begin; item < end; ++item ) { + (*this) += Stat1(*item); + } + } + // Create a sample of value dat and weight w + Stat1(const VType &dat, const NumType &w) { + sum_ = w * dat; + sum_squares_ = w * Sqr(dat); + numsamples_ = w; + } + // Copy operator + Stat1(const Self &stat) { + sum_ = stat.sum_; + sum_squares_ = stat.sum_squares_; + numsamples_ = stat.numsamples_; + } + + inline Self &operator =(const Self &stat) { + sum_ = stat.sum_; + sum_squares_ = stat.sum_squares_; + numsamples_ = stat.numsamples_; + return (*this); + } + // Merge statistics from two sample sets. + inline Self &operator +=(const Self &stat) { + sum_ += stat.sum_; + sum_squares_+= stat.sum_squares_; + numsamples_ += stat.numsamples_; + return (*this); + } + // The operation opposite to += + inline Self &operator -=(const Self &stat) { + sum_ -= stat.sum_; + sum_squares_-= stat.sum_squares_; + numsamples_ -= stat.numsamples_; + return (*this); + } + // Multiply the weight of the set of samples by a factor k + inline Self &operator *=(const VType &k) { + sum_ *= k; + sum_squares_*= k; + numsamples_ *= k; + return (*this); + } + // Merge statistics from two sample sets. + inline Self operator + (const Self &stat) const { + return Self(*this) += stat; + } + // The operation opposite to + + inline Self operator - (const Self &stat) const { + return Self(*this) -= stat; + } + // Multiply the weight of the set of samples by a factor k + inline Self operator * (const VType &k) const { + return Self(*this) *= k; + } + // Return the total weight of this sample set + NumType NumSamples() const { + return numsamples_; + } + // Return the sum of this sample set + VType Sum() const { + return sum_; + } + // Return the mean of this sample set + VType Mean() const { + if (numsamples_ == 0) return VType(); + return sum_ * (1.0 / numsamples_); + } + // Return the mean of this sample set and compute the standard deviation at + // the same time. + VType Mean(VType *stddev) const { + if (numsamples_ == 0) return VType(); + VType mean = sum_ * (1.0 / numsamples_); + if (stddev) { + VType avg_squares = sum_squares_ * (1.0 / numsamples_); + *stddev = Sqrt(avg_squares - Sqr(mean)); + } + return mean; + } + // Return the standard deviation of the sample set + VType StdDev() const { + if (numsamples_ == 0) return VType(); + VType mean = Mean(); + VType avg_squares = sum_squares_ * (1.0 / numsamples_); + return Sqrt(avg_squares - Sqr(mean)); + } + private: + // Let i be the index of the samples provided (using +=) + // and weight[i],value[i] be the data of sample #i + // then the variables have the following meaning: + NumType numsamples_; // sum of weight[i]; + VType sum_; // sum of weight[i]*value[i]; + VType sum_squares_; // sum of weight[i]*value[i]^2; + + // Template function used to square a number. + // For a vector we square all components + template + static inline SType Sqr(const SType &dat) { + return dat * dat; + } + template + static inline Vector2 Sqr(const Vector2 &dat) { + return dat.MulComponents(dat); + } + template + static inline Vector3 Sqr(const Vector3 &dat) { + return dat.MulComponents(dat); + } + template + static inline Vector4 Sqr(const Vector4 &dat) { + return dat.MulComponents(dat); + } + + // Template function used to take the square root of a number. + // For a vector we square all components + template + static inline SType Sqrt(const SType &dat) { + // Avoid NaN due to imprecision in the calculations + if ( dat < 0 ) + return 0; + return sqrt(dat); + } + template + static inline Vector2 Sqrt(const Vector2 &dat) { + // Avoid NaN due to imprecision in the calculations + return Max(dat, Vector2()).Sqrt(); + } + template + static inline Vector3 Sqrt(const Vector3 &dat) { + // Avoid NaN due to imprecision in the calculations + return Max(dat, Vector3()).Sqrt(); + } + template + static inline Vector4 Sqrt(const Vector4 &dat) { + // Avoid NaN due to imprecision in the calculations + return Max(dat, Vector4()).Sqrt(); + } +}; + +// Useful printing function +template +inline std::ostream& operator<<(std::ostream& out, + const Stat1& s) { + out << "{ avg = " << s.Mean() + << " std = " << s.StdDev() + << " nsamples = " << s.NumSamples() << "}"; + return out; +} + + +// Stat1MinMax: same as Stat1, but it also +// keeps the Min and Max values; the "-" +// operator is disabled because it cannot be implemented +// efficiently +template +class Stat1MinMax : public Stat1 { + public: + typedef Stat1MinMax Self; + + Stat1MinMax() { + Clear(); + } + void Clear() { + Stat1::Clear(); + if (std::numeric_limits::has_infinity) { + min_ = std::numeric_limits::infinity(); + max_ = -std::numeric_limits::infinity(); + } else { + min_ = std::numeric_limits::max(); + max_ = std::numeric_limits::min(); + } + } + // Create a sample of value dat and weight 1 + explicit Stat1MinMax(const VType &dat) : Stat1(dat) { + max_ = dat; + min_ = dat; + } + // Create statistics for all the samples between begin (included) + // and end(excluded) + explicit Stat1MinMax(const VType *begin, const VType *end) { + Clear(); + for ( const VType *item = begin; item < end; ++item ) { + (*this) += Stat1MinMax(*item); + } + } + // Create a sample of value dat and weight w + Stat1MinMax(const VType &dat, const NumType &w) + : Stat1(dat, w) { + max_ = dat; + min_ = dat; + } + // Copy operator + Stat1MinMax(const Self &stat) : Stat1(stat) { + max_ = stat.max_; + min_ = stat.min_; + } + inline Self &operator =(const Self &stat) { + this->Stat1::operator=(stat); + max_ = stat.max_; + min_ = stat.min_; + return (*this); + } + // Merge statistics from two sample sets. + inline Self &operator +=(const Self &stat) { + this->Stat1::operator+=(stat); + if (stat.max_ > max_) max_ = stat.max_; + if (stat.min_ < min_) min_ = stat.min_; + return (*this); + } + // Multiply the weight of the set of samples by a factor k + inline Self &operator *=(const VType &stat) { + this->Stat1::operator*=(stat); + return (*this); + } + // Merge statistics from two sample sets. + inline Self operator + (const Self &stat) const { + return Self(*this) += stat; + } + // Multiply the weight of the set of samples by a factor k + inline Self operator * (const VType &k) const { + return Self(*this) *= k; + } + private: + // The - operation makes no sense with Min/Max + // unless we keep the full list of values (but we don't) + // make it private, and let it undefined so nobody can call it + Self &operator -=(const Self &stat); // senseless. let it undefined. + + // The operation opposite to - + Self operator - (const Self &stat) const; // senseless. let it undefined. + + public: + // Return the maximal value in this sample set + VType Max() const { + return max_; + } + // Return the minimal value in this sample set + VType Min() const { + return min_; + } + private: + // Let i be the index of the samples provided (using +=) + // and weight[i],value[i] be the data of sample #i + // then the variables have the following meaning: + VType max_; // max of value[i] + VType min_; // min of value[i] +}; + +// Useful printing function +template +inline std::ostream& operator <<(std::ostream& out, + const Stat1MinMax& s) { + out << "{ avg = " << s.Mean() + << " std = " << s.StdDev() + << " nsamples = " << s.NumSamples() + << " min = " << s.Min() + << " max = " << s.Max() << "}"; + return out; +} + +#endif // BENCHMARK_STAT_H_ diff --git a/src/sysinfo.cc b/src/sysinfo.cc new file mode 100644 index 00000000..644b66f0 --- /dev/null +++ b/src/sysinfo.cc @@ -0,0 +1,337 @@ +#include "sysinfo.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cycleclock.h" +#include "macros.h" +#include "mutex_lock.h" +#include "sleep.h" + +namespace { +pthread_once_t cpuinfo_init = PTHREAD_ONCE_INIT; +double cpuinfo_cycles_per_second = 1.0; +int cpuinfo_num_cpus = 1; // Conservative guess +static pthread_mutex_t cputimens_mutex; + +// Helper function estimates cycles/sec by observing cycles elapsed during +// sleep(). Using small sleep time decreases accuracy significantly. +int64_t EstimateCyclesPerSecond(const int estimate_time_ms) { + CHECK(estimate_time_ms > 0); + double multiplier = 1000.0 / (double)estimate_time_ms; // scale by this much + + const int64_t start_ticks = CycleClock::Now(); + SleepForMilliseconds(estimate_time_ms); + const int64_t guess = int64_t(multiplier * (CycleClock::Now() - start_ticks)); + return guess; +} + +// Helper function for reading an int from a file. Returns true if successful +// and the memory location pointed to by value is set to the value read. +bool ReadIntFromFile(const char *file, int *value) { + bool ret = false; + int fd = open(file, O_RDONLY); + if (fd != -1) { + char line[1024]; + char* err; + memset(line, '\0', sizeof(line)); + CHECK(read(fd, line, sizeof(line) - 1)); + const int temp_value = strtol(line, &err, 10); + if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { + *value = temp_value; + ret = true; + } + close(fd); + } + return ret; +} + +void InitializeSystemInfo() { + bool saw_mhz = false; + + // TODO: destroy this + pthread_mutex_init(&cputimens_mutex, NULL); + +#if defined OS_LINUX || defined OS_CYGWIN + char line[1024]; + char* err; + int freq; + + // If the kernel is exporting the tsc frequency use that. There are issues + // where cpuinfo_max_freq cannot be relied on because the BIOS may be + // exporintg an invalid p-state (on x86) or p-states may be used to put the + // processor in a new mode (turbo mode). Essentially, those frequencies + // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as + // well. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) { + // The value is in kHz (as the file name suggests). For example, on a + // 2GHz warpstation, the file contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } + + // If CPU scaling is in effect, we want to use the *maximum* frequency, + // not whatever CPU speed some random processor happens to be using now. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", + &freq)) { + // The value is in kHz. For example, on a 2GHz warpstation, the file + // contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } + + // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq. + const char* pname = "/proc/cpuinfo"; + int fd = open(pname, O_RDONLY); + if (fd == -1) { + perror(pname); + if (!saw_mhz) { + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } + return; // TODO: use generic tester instead? + } + + double bogo_clock = 1.0; + bool saw_bogo = false; + int max_cpu_id = 0; + int num_cpus = 0; + line[0] = line[1] = '\0'; + int chars_read = 0; + do { // we'll exit when the last read didn't read anything + // Move the next line to the beginning of the buffer + const int oldlinelen = strlen(line); + if (sizeof(line) == oldlinelen + 1) // oldlinelen took up entire line + line[0] = '\0'; + else // still other lines left to save + memmove(line, line + oldlinelen+1, sizeof(line) - (oldlinelen+1)); + // Terminate the new line, reading more if we can't find the newline + char* newline = strchr(line, '\n'); + if (newline == NULL) { + const int linelen = strlen(line); + const int bytes_to_read = sizeof(line)-1 - linelen; + CHECK(bytes_to_read > 0); // because the memmove recovered >=1 bytes + chars_read = read(fd, line + linelen, bytes_to_read); + line[linelen + chars_read] = '\0'; + newline = strchr(line, '\n'); + } + if (newline != NULL) + *newline = '\0'; + + // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only + // accept postive values. Some environments (virtual machines) report zero, + // which would cause infinite looping in WallTime_Init. + if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0) + saw_mhz = true; + } + } else if (strncasecmp(line, "bogomips", sizeof("bogomips")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + bogo_clock = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0) + saw_bogo = true; + } + } else if (strncasecmp(line, "processor", sizeof("processor")-1) == 0) { + num_cpus++; // count up every time we see an "processor :" entry + const char* freqstr = strchr(line, ':'); + if (freqstr) { + const int cpu_id = strtol(freqstr+1, &err, 10); + if (freqstr[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id) + max_cpu_id = cpu_id; + } + } + } while (chars_read > 0); + close(fd); + + if (!saw_mhz) { + if (saw_bogo) { + // If we didn't find anything better, we'll use bogomips, but + // we're not happy about it. + cpuinfo_cycles_per_second = bogo_clock; + } else { + // If we don't even have bogomips, we'll use the slow estimation. + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } + } + if (num_cpus == 0) { + fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n"); + } else { + if ((max_cpu_id + 1) != num_cpus) { + fprintf(stderr, + "CPU ID assignments in /proc/cpuinfo seems messed up." + " This is usually caused by a bad BIOS.\n"); + } + cpuinfo_num_cpus = num_cpus; + } + +#elif defined OS_FREEBSD + // For this sysctl to work, the machine must be configured without + // SMP, APIC, or APM support. hz should be 64-bit in freebsd 7.0 + // and later. Before that, it's a 32-bit quantity (and gives the + // wrong answer on machines faster than 2^32 Hz). See + // http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html + // But also compare FreeBSD 7.0: + // http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223 + // 231 error = sysctl_handle_quad(oidp, &freq, 0, req); + // To FreeBSD 6.3 (it's the same in 6-STABLE): + // http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131 + // 139 error = sysctl_handle_int(oidp, &freq, sizeof(freq), req); +#if __FreeBSD__ >= 7 + uint64_t hz = 0; +#else + unsigned int hz = 0; +#endif + size_t sz = sizeof(hz); + const char *sysctl_path = "machdep.tsc_freq"; + if ( sysctlbyname(sysctl_path, &hz, &sz, NULL, 0) != 0 ) { + fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", + sysctl_path, strerror(errno)); + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } else { + cpuinfo_cycles_per_second = hz; + } + // TODO: also figure out cpuinfo_num_cpus + +#elif defined OS_WINDOWS +# pragma comment(lib, "shlwapi.lib") // for SHGetValue() + // In NT, read MHz from the registry. If we fail to do so or we're in win9x + // then make a crude estimate. + OSVERSIONINFO os; + os.dwOSVersionInfoSize = sizeof(os); + DWORD data, data_size = sizeof(data); + if (GetVersionEx(&os) && + os.dwPlatformId == VER_PLATFORM_WIN32_NT && + SUCCEEDED(SHGetValueA(HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "~MHz", NULL, &data, &data_size))) + cpuinfo_cycles_per_second = (int64)data * (int64)(1000 * 1000); // was mhz + else + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(500); // TODO <500? + // TODO: also figure out cpuinfo_num_cpus + +#elif defined OS_MACOSX + // returning "mach time units" per second. the current number of elapsed + // mach time units can be found by calling uint64 mach_absolute_time(); + // while not as precise as actual CPU cycles, it is accurate in the face + // of CPU frequency scaling and multi-cpu/core machines. + // Our mac users have these types of machines, and accuracy + // (i.e. correctness) trumps precision. + // See cycleclock.h: CycleClock::Now(), which returns number of mach time + // units on Mac OS X. + mach_timebase_info_data_t timebase_info; + mach_timebase_info(&timebase_info); + double mach_time_units_per_nanosecond = + static_cast(timebase_info.denom) / + static_cast(timebase_info.numer); + cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9; + + int num_cpus = 0; + size_t size = sizeof(num_cpus); + int numcpus_name[] = { CTL_HW, HW_NCPU }; + if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, 0, 0) + == 0 + && (size == sizeof(num_cpus))) + cpuinfo_num_cpus = num_cpus; + +#else + // Generic cycles per second counter + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); +#endif +} +} // end namespace + +#ifndef OS_WINDOWS +// getrusage() based implementation of MyCPUUsage +static double MyCPUUsageRUsage() { + struct rusage ru; + if (getrusage(RUSAGE_SELF, &ru) == 0) { + return (static_cast(ru.ru_utime.tv_sec) + + static_cast(ru.ru_utime.tv_usec)*1e-6 + + static_cast(ru.ru_stime.tv_sec) + + static_cast(ru.ru_stime.tv_usec)*1e-6); + } else { + return 0.0; + } +} + +static bool MyCPUUsageCPUTimeNsLocked(double *cputime) { + static int cputime_fd = -1; + if (cputime_fd == -1) { + cputime_fd = open("/proc/self/cputime_ns", O_RDONLY); + if (cputime_fd < 0) { + cputime_fd = -1; + return false; + } + } + char buff[64]; + memset(buff, 0, sizeof(buff)); + if (pread(cputime_fd, buff, sizeof(buff)-1, 0) <= 0) { + close(cputime_fd); + cputime_fd = -1; + return false; + } + unsigned long long result = strtoull(buff, NULL, 0); + if (result == (std::numeric_limits::max)()) { + close(cputime_fd); + cputime_fd = -1; + return false; + } + *cputime = static_cast(result) / 1e9; + return true; +} + +double MyCPUUsage() { + { + mutex_lock l(&cputimens_mutex); + static bool use_cputime_ns = true; + if (use_cputime_ns) { + double value; + if (MyCPUUsageCPUTimeNsLocked(&value)) { + return value; + } + // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage(). + std::cout << "Reading /proc/self/cputime_ns failed. Using getrusage().\n"; + use_cputime_ns = false; + } + } + return MyCPUUsageRUsage(); +} + +double ChildrenCPUUsage() { + struct rusage ru; + if (getrusage(RUSAGE_CHILDREN, &ru) == 0) { + return (static_cast(ru.ru_utime.tv_sec) + + static_cast(ru.ru_utime.tv_usec)*1e-6 + + static_cast(ru.ru_stime.tv_sec) + + static_cast(ru.ru_stime.tv_usec)*1e-6); + } else { + return 0.0; + } +} +#endif // OS_WINDOWS + +double CyclesPerSecond(void) { + pthread_once(&cpuinfo_init, &InitializeSystemInfo); + return cpuinfo_cycles_per_second; +} + +int NumCPUs(void) { + pthread_once(&cpuinfo_init, &InitializeSystemInfo); + return cpuinfo_num_cpus; +} + diff --git a/src/sysinfo.h b/src/sysinfo.h new file mode 100644 index 00000000..0b85d5cc --- /dev/null +++ b/src/sysinfo.h @@ -0,0 +1,9 @@ +#ifndef BENCHMARK_SYSINFO_H_ +#define BENCHMARK_SYSINFO_H_ + +double MyCPUUsage(); +double ChildrenCPUUsage(); +int NumCPUs(); +double CyclesPerSecond(); + +#endif // BENCHMARK_SYSINFO_H_ diff --git a/src/walltime.cc b/src/walltime.cc new file mode 100644 index 00000000..85384aac --- /dev/null +++ b/src/walltime.cc @@ -0,0 +1,137 @@ +#include "walltime.h" + +#include +#include +#include +#include + +#include +#include + +#include "cycleclock.h" +#include "macros.h" +#include "sysinfo.h" + +namespace walltime { +namespace { +const double kMaxErrorInterval = 100e-6; + +std::atomic initialized(false); +WallTime base_walltime = 0.0; +int64_t base_cycletime = 0; +int64_t cycles_per_second; +double seconds_per_cycle; +uint32_t last_adjust_time = 0; +std::atomic drift_adjust(0); +int64_t max_interval_cycles = 0; + +// Helper routines to load/store a float from an AtomicWord. Required because +// g++ < 4.7 doesn't support std::atomic correctly. I cannot wait to get +// rid of this horror show. +inline void SetDrift(float f) { + int32_t w; + memcpy(&w, &f, sizeof(f)); + std::atomic_store(&drift_adjust, w); +} + +inline float GetDrift() { + float f; + int32_t w = std::atomic_load(&drift_adjust); + memcpy(&f, &w, sizeof(f)); + return f; +} + +static_assert(sizeof(float) <= sizeof(int32_t), + "type sizes don't allow the drift_adjust hack"); + +WallTime Slow() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + tv.tv_usec * 1e-6; +} + +bool SplitTimezone(WallTime value, bool local, struct tm* t, + double* subsecond) { + memset(t, 0, sizeof(*t)); + if ((value < 0) || (value > std::numeric_limits::max())) { + *subsecond = 0.0; + return false; + } + const time_t whole_time = static_cast(value); + *subsecond = value - whole_time; + if (local) + localtime_r(&whole_time, t); + else + gmtime_r(&whole_time, t); + return true; +} +} // end namespace + +// This routine should be invoked to initialize walltime. +// It is not intended for general purpose use. +void Initialize() { + CHECK(!std::atomic_load(&initialized)); + cycles_per_second = static_cast(CyclesPerSecond()); + CHECK(cycles_per_second != 0); + seconds_per_cycle = 1.0 / cycles_per_second; + max_interval_cycles = static_cast( + cycles_per_second * kMaxErrorInterval); + do { + base_cycletime = CycleClock::Now(); + base_walltime = Slow(); + } while (CycleClock::Now() - base_cycletime > max_interval_cycles); + // We are now sure that "base_walltime" and "base_cycletime" were produced + // within kMaxErrorInterval of one another. + + SetDrift(0.0); + last_adjust_time = static_cast(uint64_t(base_cycletime) >> 32); + std::atomic_store(&initialized, true); +} + +WallTime Now() { + if (!std::atomic_load(&initialized)) + return Slow(); + + WallTime now = 0.0; + WallTime result = 0.0; + int64_t ct = 0; + uint32_t top_bits = 0; + do { + ct = CycleClock::Now(); + int64_t cycle_delta = ct - base_cycletime; + result = base_walltime + cycle_delta * seconds_per_cycle; + + top_bits = static_cast(uint64_t(ct) >> 32); + // Recompute drift no more often than every 2^32 cycles. + // I.e., @2GHz, ~ every two seconds + if (top_bits == last_adjust_time) { // don't need to recompute drift + return result + GetDrift(); + } + + now = Slow(); + } while (CycleClock::Now() - ct > max_interval_cycles); + // We are now sure that "now" and "result" were produced within + // kMaxErrorInterval of one another. + + SetDrift(now - result); + last_adjust_time = top_bits; + return now; +} + +const char* Print(WallTime time, const char *format, bool local, + char* storage, int *remainder_us) { + struct tm split; + double subsecond; + if (!SplitTimezone(time, local, &split, &subsecond)) { + snprintf(storage, sizeof(storage), "Invalid time: %f", time); + } else { + if (remainder_us != NULL) { + *remainder_us = static_cast((subsecond * 1000000) + 0.5); + if (*remainder_us > 999999) *remainder_us = 999999; + if (*remainder_us < 0) *remainder_us = 0; + } + strftime(storage, sizeof(storage), format, &split); + } + return storage; +} +} // end namespace walltime diff --git a/src/walltime.h b/src/walltime.h new file mode 100644 index 00000000..13cda806 --- /dev/null +++ b/src/walltime.h @@ -0,0 +1,19 @@ +#ifndef BENCHMARK_WALLTIME_H_ +#define BENCHMARK_WALLTIME_H_ + +typedef double WallTime; + +namespace walltime { +void Initialize(); +WallTime Now(); + +// GIVEN: walltime, generic format string (as understood by strftime), +// a boolean flag specifying if the time is local or UTC (true=local). +// RETURNS: the formatted string. ALSO RETURNS: the storage printbuffer +// passed and the remaining number of microseconds (never printed in +// the string since strftime does not understand it) +const char* Print(WallTime time, const char *format, bool local, + char* storage, int *remainder_us); +} // end namespace walltime + +#endif // BENCHMARK_WALLTIME_H_ diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc new file mode 100644 index 00000000..7c0c6d77 --- /dev/null +++ b/test/benchmark_test.cc @@ -0,0 +1,138 @@ +#include "benchmark/benchmark.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +int ATTRIBUTE_NOINLINE Factorial(uint32_t n) { + return (n == 1) ? 1 : n * Factorial(n - 1); +} + +double CalculatePi(int depth) { + double pi = 0.0; + for (int i = 0; i < depth; ++i) { + double numerator = static_cast(((i % 2) * 2) - 1); + double denominator = static_cast((2 * i) - 1); + pi += numerator / denominator; + } + return (pi - 1.0) * 4; +} + +std::set ConstructRandomSet(int size) { + std::set s; + for (int i = 0; i < size; ++i) + s.insert(i); + return s; +} + +static std::vector* test_vector = NULL; + +} // end namespace + +#ifdef DEBUG +static void BM_Factorial(benchmark::State& state) { + int fac_42 = 0; + while (state.KeepRunning()) + fac_42 = Factorial(8); + // Prevent compiler optimizations + CHECK(fac_42 != std::numeric_limits::max()); +} +BENCHMARK(BM_Factorial); +#endif + +static void BM_CalculatePiRange(benchmark::State& state) { + double pi = 0.0; + while (state.KeepRunning()) + pi = CalculatePi(state.range_x()); + std::stringstream ss; + ss << pi; + state.SetLabel(ss.str()); +} +BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024); + +static void BM_CalculatePi(benchmark::State& state) { + static const int depth = 1024; + double pi ATTRIBUTE_UNUSED = 0.0; + while (state.KeepRunning()) { + pi = CalculatePi(depth); + } +} +BENCHMARK(BM_CalculatePi)->Threads(8); +BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32); +BENCHMARK(BM_CalculatePi)->ThreadPerCpu(); + +static void BM_SetInsert(benchmark::State& state) { + while (state.KeepRunning()) { + state.PauseTiming(); + std::set data = ConstructRandomSet(state.range_x()); + state.ResumeTiming(); + for (int j = 0; j < state.range_y(); ++j) + data.insert(rand()); + } +} +BENCHMARK(BM_SetInsert)->RangePair(1<<10,8<<10, 1,10); + +template +static void BM_Sequential(benchmark::State& state) { + Q q; + typename Q::value_type v; + while (state.KeepRunning()) + for (int i = state.range_x(); --i; ) + q.push_back(v); + const int64_t items_processed = + static_cast(state.iterations()) * state.range_x(); + state.SetItemsProcessed(items_processed); + state.SetBytesProcessed(items_processed * sizeof(v)); +} +BENCHMARK_TEMPLATE(BM_Sequential, std::vector)->Range(1 << 0, 1 << 10); +BENCHMARK_TEMPLATE(BM_Sequential, std::list)->Range(1 << 0, 1 << 10); + +static void BM_StringCompare(benchmark::State& state) { + std::string s1(state.range_x(), '-'); + std::string s2(state.range_x(), '-'); + int r = 0; + while (state.KeepRunning()) + r |= s1.compare(s2); + // Prevent compiler optimizations + CHECK(r != std::numeric_limits::max()); +} +BENCHMARK(BM_StringCompare)->Range(1, 1<<20); + +static void BM_SetupTeardown(benchmark::State& state) { + if (state.thread_index == 0) + test_vector = new std::vector(); + while (state.KeepRunning()) + test_vector->push_back(0); + if (state.thread_index == 0) { + delete test_vector; + test_vector = NULL; + } +} +BENCHMARK(BM_SetupTeardown); + +static void BM_LongTest(benchmark::State& state) { + double tracker = 0.0; + while (state.KeepRunning()) + for (int i = 0; i < state.range_x(); ++i) + tracker += i; + CHECK(tracker != 0.0); +} +BENCHMARK(BM_LongTest)->Range(1<<16,1<<28); + +int main(int argc, const char* argv[]) { + benchmark::Initialize(&argc, argv); + + CHECK(Factorial(8) == 40320); + CHECK(CalculatePi(1) == 0.0); + + benchmark::RunSpecifiedBenchmarks(); +} +