From 7a767012f1c423b37069f6d315b97164b5850271 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Thu, 12 Mar 2015 18:03:33 -0400 Subject: [PATCH] Adopt new benchmark timing internals. This patch adopts a new internal structure for how timings are performed. Currently every iteration of a benchmark checks to see if it has been running for an appropriate amount of time. Checking the clock introduces noise into the timings and this can cause inconsistent output from each benchmark. Now every iteration of a benchmark only checks an iteration count to see if it should stop running. The iteration count is determined beforehand by testing the benchmark on a series of increasing iteration counts until a suitable count is found. This increases the amount of time it takes to run the actual benchmarks but it also greatly increases the accuracy of the results. This patch introduces some breaking changes. The notable breaking changes are: 1. Benchmarks run on multiple threads no longer generate a report per thread. Instead only a single report is generated. 2. ::benchmark::UseRealTime() was removed and replaced with State::UseRealTime(). 
--- CMakeLists.txt | 4 + cmake/thread_safety_attributes.cpp | 4 + include/benchmark/benchmark.h | 286 +++--- src/CMakeLists.txt | 5 +- src/benchmark.cc | 1470 ++++++++++++---------------- src/mutex.h | 142 +++ src/string_util.cc | 10 +- src/sysinfo.cc | 3 +- test/CMakeLists.txt | 3 + test/basic_test.cc | 105 ++ test/benchmark_test.cc | 19 +- 11 files changed, 1044 insertions(+), 1007 deletions(-) create mode 100644 cmake/thread_safety_attributes.cpp create mode 100644 src/mutex.h create mode 100644 test/basic_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 2526faf4..4296b235 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,10 @@ add_cxx_compiler_flag(-pedantic-errors) add_cxx_compiler_flag(-fno-strict-aliasing RELEASE) add_cxx_compiler_flag(-Wthread-safety) +if (HAVE_WTHREAD_SAFETY) + add_definitions(-DHAVE_WTHREAD_SAFETY) + cxx_feature_check(THREAD_SAFETY_ATTRIBUTES) +endif() # C++ feature checks cxx_feature_check(STD_REGEX) diff --git a/cmake/thread_safety_attributes.cpp b/cmake/thread_safety_attributes.cpp new file mode 100644 index 00000000..46161bab --- /dev/null +++ b/cmake/thread_safety_attributes.cpp @@ -0,0 +1,4 @@ +#define HAVE_THREAD_SAFETY_ATTRIBUTES +#include "../src/mutex.h" + +int main() {} diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h index 5da915ea..50f27f0d 100644 --- a/include/benchmark/benchmark.h +++ b/include/benchmark/benchmark.h @@ -135,7 +135,8 @@ BENCHMARK(BM_MultiThreaded)->Threads(4); #ifndef BENCHMARK_BENCHMARK_H_ #define BENCHMARK_BENCHMARK_H_ -#include +#include +#include #include #include @@ -153,10 +154,7 @@ void Initialize(int* argc, const char** argv); // Otherwise, run all benchmarks specified by the --benchmark_filter flag, // and exit after running the benchmarks. 
-void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr); - -// ------------------------------------------------------ -// Routines that can be called from within a benchmark +void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = NULL); // If this routine is called, peak memory allocation past this point in the // benchmark is reported at the end of the benchmark report line. (It is @@ -165,14 +163,6 @@ void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr); // TODO(dominic) // void MemoryUsage(); -// If a particular benchmark is I/O bound, or if for some reason CPU -// timings are not representative, call this method from within the -// benchmark routine. If called, the elapsed time will be used to -// control how many iterations are run, and in the printing of -// items/second or MB/seconds values. If not called, the cpu time -// used by the benchmark will be used. -void UseRealTime(); - namespace internal { class Benchmark; class BenchmarkFamilies; @@ -181,13 +171,63 @@ class BenchmarkFamilies; // State is passed to a running Benchmark and contains state for the // benchmark to use. class State { - public: - // Returns true iff the benchmark should continue through another iteration. - bool KeepRunning(); +public: + State(size_t max_iters, bool has_x, int x, bool has_y, int y, int thread_i); + // Returns true iff the benchmark should continue through another iteration. + // NOTE: A benchmark may not return from the test until KeepRunning() has + // returned false. + bool KeepRunning() { + if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) { + ResumeTiming(); + started_ = true; + } + bool const res = total_iterations_++ < max_iterations; + if (BENCHMARK_BUILTIN_EXPECT(!res, false)) { + assert(started_); + PauseTiming(); + // Total iterations now is one greater than max iterations. Fix this. + total_iterations_ = max_iterations; + } + return res; + } + + // REQUIRES: timer is running + // Stop the benchmark timer. 
If not called, the timer will be + // automatically stopped after KeepRunning() returns false for the first time. + // + // For threaded benchmarks the PauseTiming() function acts + // like a barrier. I.e., the ith call by a particular thread to this + // function will block until all threads have made their ith call. + // The timer will stop when the last thread has called this function. + // + // NOTE: PauseTiming()/ResumeTiming() are relatively + // heavyweight, and so their use should generally be avoided + // within each benchmark iteration, if possible. void PauseTiming(); + + // REQUIRES: timer is not running + // Start the benchmark timer. The timer is NOT running on entrance to the + // benchmark function. It begins running after the first call to KeepRunning() + // + // For threaded benchmarks the ResumeTiming() function acts + // like a barrier. I.e., the ith call by a particular thread to this + // function will block until all threads have made their ith call. + // The timer will start when the last thread has called this function. + // + // NOTE: PauseTiming()/ResumeTiming() are relatively + // heavyweight, and so their use should generally be avoided + // within each benchmark iteration, if possible. void ResumeTiming(); + // If a particular benchmark is I/O bound, or if for some reason CPU + // timings are not representative, call this method from within the + // benchmark routine. If called, the elapsed time will be used to + // control how many iterations are run, and in the printing of + // items/second or MB/seconds values. If not called, the cpu time + // used by the benchmark will be used. + void UseRealTime(); + // Set the number of bytes processed by the current benchmark // execution. This routine is typically called once at the end of a // throughput oriented benchmark. If this routine is called with a @@ -195,7 +235,15 @@ class State { // per iteration. // // REQUIRES: a benchmark has exited its KeepRunning loop. 
- void SetBytesProcessed(int64_t bytes); + BENCHMARK_ALWAYS_INLINE + void SetBytesProcessed(size_t bytes) { + bytes_processed_ = bytes; + } + + BENCHMARK_ALWAYS_INLINE + size_t bytes_processed() const { + return bytes_processed_; + } // If this routine is called with items > 0, then an items/s // label is printed on the benchmark report line for the currently @@ -203,94 +251,76 @@ class State { // benchmark where a processing items/second output is desired. // // REQUIRES: a benchmark has exited its KeepRunning loop. - void SetItemsProcessed(int64_t items); + BENCHMARK_ALWAYS_INLINE + void SetItemsProcessed(size_t items) { + items_processed_ = items; + } + + BENCHMARK_ALWAYS_INLINE + size_t items_processed() const { + return items_processed_; + } // If this routine is called, the specified label is printed at the // end of the benchmark report line for the currently executing // benchmark. Example: - // static void BM_Compress(benchmark::State& state) { + // static void BM_Compress(int iters) { // ... // double compress = input_size / output_size; - // state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression)); + // benchmark::SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression)); // } // Produces output that looks like: // BM_Compress 50 50 14115038 compress:27.3% // // REQUIRES: a benchmark has exited its KeepRunning loop. - void SetLabel(const std::string& label); + void SetLabel(const char* label); + + // Allow the use of std::string without actually including . + // This function does not participate in overload resolution unless StringType + // has the nested typename `basic_string`. This typename should be provided + // as an injected class name in the case of std::string. + template + void SetLabel(StringType const & str, + typename StringType::basic_string* = 0) { + this->SetLabel(str.c_str()); + } // Range arguments for this run. CHECKs if the argument has been set. 
- int range_x() const; - int range_y() const; + BENCHMARK_ALWAYS_INLINE + int range_x() const { + assert(has_range_x_); + ((void)has_range_x_); // Prevent unused warning. + return range_x_; + } - int64_t iterations() const { return total_iterations_; } + BENCHMARK_ALWAYS_INLINE + int range_y() const { + assert(has_range_y_); + ((void)has_range_y_); // Prevent unused warning. + return range_y_; + } + BENCHMARK_ALWAYS_INLINE + size_t iterations() const { return total_iterations_; } + +private: + bool started_; + size_t total_iterations_; + + bool has_range_x_; + int range_x_; + + bool has_range_y_; + int range_y_; + + size_t bytes_processed_; + size_t items_processed_; + +public: const int thread_index; + const size_t max_iterations; - private: - class FastClock; - struct SharedState; - struct ThreadStats; - - State(FastClock* clock, SharedState* s, int t); - bool StartRunning(); - bool FinishInterval(); - bool MaybeStop(); - void NewInterval(); - bool AllStarting(); - - static void* RunWrapper(void* arg); - void Run(); - void RunAsThread(); - void Wait(); - - enum EState { - STATE_INITIAL, // KeepRunning hasn't been called - STATE_STARTING, // KeepRunning called, waiting for other threads - STATE_RUNNING, // Running and being timed - STATE_STOPPING, // Not being timed but waiting for other threads - STATE_STOPPED // Stopped - }; - - EState state_; - - FastClock* clock_; - - // State shared by all BenchmarkRun objects that belong to the same - // BenchmarkInstance - SharedState* shared_; - - std::thread thread_; - - // Custom label set by the user. - std::string label_; - - // Each State object goes through a sequence of measurement intervals. By - // default each interval is approx. 100ms in length. The following stats are - // kept for each interval. 
- int64_t iterations_; - double start_cpu_; - double start_time_; - int64_t stop_time_micros_; - - double start_pause_cpu_; - double pause_cpu_time_; - double start_pause_real_; - double pause_real_time_; - - // Total number of iterations for all finished runs. - int64_t total_iterations_; - - // Approximate time in microseconds for one interval of execution. - // Dynamically adjusted as needed. - int64_t interval_micros_; - - // True if the current interval is the continuation of a previous one. - bool is_continuation_; - - std::unique_ptr stats_; - - friend class internal::Benchmark; +private: BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State); }; @@ -304,7 +334,6 @@ class BenchmarkReporter { struct Context { int num_cpus; double mhz_per_cpu; - // std::string cpu_info; bool cpu_scaling_enabled; // The number of chars in the longest benchmark name. @@ -312,19 +341,17 @@ class BenchmarkReporter { }; struct Run { - Run() - : thread_index(-1), - iterations(1), - real_accumulated_time(0), - cpu_accumulated_time(0), - bytes_per_second(0), - items_per_second(0), - max_heapbytes_used(0) {} + Run() : + iterations(1), + real_accumulated_time(0), + cpu_accumulated_time(0), + bytes_per_second(0), + items_per_second(0), + max_heapbytes_used(0) {} std::string benchmark_name; - std::string report_label; - int thread_index; - int64_t iterations; + std::string report_label; // Empty if not set by benchmark. + size_t iterations; double real_accumulated_time; double cpu_accumulated_time; @@ -350,22 +377,12 @@ class BenchmarkReporter { // benchmark, thus have the same name. virtual void ReportRuns(const std::vector& report) const = 0; - virtual ~BenchmarkReporter() {} + virtual ~BenchmarkReporter(); }; namespace internal { -typedef std::function BenchmarkFunction; - -// Run all benchmarks whose name is a partial match for the regular -// expression in "spec". The results of benchmark runs are fed to "reporter". 
-void RunMatchingBenchmarks(const std::string& spec, - const BenchmarkReporter* reporter); - -// Extract the list of benchmark names that match the specified regular -// expression. -void FindMatchingBenchmarkNames(const std::string& re, - std::vector* benchmark_names); +typedef void(Function)(State&); // ------------------------------------------------------ // Benchmark registration object. The BENCHMARK() macro expands @@ -375,8 +392,7 @@ void FindMatchingBenchmarkNames(const std::string& re, // chained into one expression. class Benchmark { public: - // The Benchmark takes ownership of the Callback pointed to by f. - Benchmark(const char* name, BenchmarkFunction f); + Benchmark(const char* name, Function* f); ~Benchmark(); @@ -444,40 +460,25 @@ class Benchmark { // Used inside the benchmark implementation struct Instance; - // Measure the overhead of an empty benchmark to subtract later. - static void MeasureOverhead(); - private: - friend class BenchmarkFamilies; - - std::vector CreateBenchmarkInstances(size_t rangeXindex, - size_t rangeYindex); - std::string name_; - BenchmarkFunction function_; - size_t registration_index_; - std::vector rangeX_; - std::vector rangeY_; + Function* function_; + std::size_t registration_index_; + int arg_count_; + std::vector< std::pair > args_; // Args for all benchmark runs std::vector thread_counts_; - std::mutex mutex_; // Special value placed in thread_counts_ to stand for NumCPUs() static const int kNumCpuMarker = -1; - // Special value used to indicate that no range is required. 
- static const size_t kNoRangeIndex = std::numeric_limits::max(); - static const int kNoRange = std::numeric_limits::max(); - static void AddRange(std::vector* dst, int lo, int hi, int mult); - static double MeasurePeakHeapMemory(const Instance& b); - static void RunInstance(const Instance& b, const BenchmarkReporter* br); - friend class ::benchmark::State; - friend struct ::benchmark::internal::Benchmark::Instance; - friend void ::benchmark::internal::RunMatchingBenchmarks( - const std::string&, const BenchmarkReporter*); + + friend class BenchmarkFamilies; + BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark); }; + // ------------------------------------------------------ // Internal implementation details follow; please ignore @@ -487,16 +488,16 @@ class ConsoleReporter : public BenchmarkReporter { public: virtual bool ReportContext(const Context& context) const; virtual void ReportRuns(const std::vector& reports) const; - private: - std::string PrintMemoryUsage(double bytes) const; virtual void PrintRunData(const Run& report) const; + // TODO(ericwf): Find a better way to share this information. 
mutable size_t name_field_width_; }; } // end namespace internal } // end namespace benchmark + // ------------------------------------------------------ // Macro to register benchmarks @@ -534,4 +535,11 @@ class ConsoleReporter : public BenchmarkReporter { __benchmark_, n, __LINE__) BENCHMARK_UNUSED = \ (new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n)) +// Helper macro to create a main routine in a test that runs the benchmarks +#define BENCHMARK_MAIN() \ + int main(int argc, const char** argv) { \ + ::benchmark::Initialize(&argc, argv); \ + ::benchmark::RunSpecifiedBenchmarks(); \ + } + #endif // BENCHMARK_BENCHMARK_H_ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5f22510d..f3a825f2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,9 +2,8 @@ include_directories(${PROJECT_SOURCE_DIR}/src) # Define the source files -set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc" - "log.cc" "sleep.cc" "string_util.cc" "sysinfo.cc" - "walltime.cc") +set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc" "log.cc" + "sleep.cc" "string_util.cc" "sysinfo.cc" "walltime.cc") # Determine the correct regular expression engine to use if(HAVE_STD_REGEX) set(RE_FILES "re_std.cc") diff --git a/src/benchmark.cc b/src/benchmark.cc index d4f6f1b3..8b0682e6 100644 --- a/src/benchmark.cc +++ b/src/benchmark.cc @@ -13,30 +13,30 @@ // limitations under the License. 
#include "benchmark/benchmark.h" -#include "arraysize.h" -#include "check.h" -#include "colorprint.h" -#include "commandlineflags.h" -#include "internal_macros.h" -#include "log.h" -#include "re.h" -#include "sleep.h" -#include "stat.h" -#include "string_util.h" -#include "sysinfo.h" -#include "walltime.h" #include -#include +#include +#include +#include +#include #include #include #include #include #include -#include #include -#include + +#include "check.h" +#include "commandlineflags.h" +#include "colorprint.h" +#include "log.h" +#include "mutex.h" +#include "re.h" +#include "stat.h" +#include "string_util.h" +#include "sysinfo.h" +#include "walltime.h" DEFINE_string(benchmark_filter, ".", "A regular expression that specifies the set of benchmarks " @@ -57,124 +57,121 @@ DEFINE_double(benchmark_min_time, 0.5, "of the benchmark execution, regardless of number of " "threads."); -DEFINE_bool(benchmark_memory_usage, false, - "Report memory usage for all benchmarks"); - DEFINE_int32(benchmark_repetitions, 1, "The number of runs of each benchmark. If greater than 1, the " "mean and standard deviation of the runs will be reported."); -DEFINE_int32(v, 0, "The level of verbose logging to output"); DEFINE_bool(color_print, true, "Enables colorized logging."); -// Will be non-empty if heap checking is turned on, which would -// invalidate any benchmarks. -DECLARE_string(heap_check); +DEFINE_int32(v, 0, "The level of verbose logging to output"); + // The ""'s catch people who don't pass in a literal for "str" #define strliterallen(str) (sizeof("" str "") - 1) // Must use a string literal for prefix. -#define memprefix(str, len, prefix) \ - ((((len) >= strliterallen(prefix)) && \ - memcmp(str, prefix, strliterallen(prefix)) == 0) \ - ? str + strliterallen(prefix) \ - : NULL) +#define memprefix(str, len, prefix) \ + ((((len) >= strliterallen(prefix)) && \ + std::memcmp(str, prefix, strliterallen(prefix)) == 0) \ + ? 
str + strliterallen(prefix) \ + : nullptr) + namespace benchmark { + +namespace internal { + +// NOTE: This is a dummy "mutex" type used to denote the actual mutex +// returned by GetBenchmarkLock(). This is only used to placate the thread +// safety warnings by giving the return of GetBenchmarkLock() a name. +struct CAPABILITY("mutex") BenchmarkLockType {}; +BenchmarkLockType BenchmarkLockVar; + +} // end namespace internal + +inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar) +GetBenchmarkLock() +{ + static Mutex lock; + return lock; +} + namespace { + // For non-dense Range, intermediate values are powers of kRangeMultiplier. static const int kRangeMultiplier = 8; - -std::mutex starting_mutex; -std::condition_variable starting_cv; +static const int kMaxIterations = 1000000000; bool running_benchmark = false; -// Should this benchmark report memory usage? -bool get_memory_usage; +// Global variable so that a benchmark can cause a little extra printing +std::string* GetReportLabel() { + static std::string label GUARDED_BY(GetBenchmarkLock()); + return &label; +} // Should this benchmark base decisions off of real time rather than // cpu time? -bool use_real_time; +bool use_real_time GUARDED_BY(GetBenchmarkLock()); -// Overhead of an empty benchmark. -double overhead = 0.0; +// TODO(ericwf): support MallocCounter. +//static benchmark::MallocCounter *benchmark_mc; -// Return prefix to print in front of each reported line -const char* Prefix() { -#ifdef NDEBUG - return ""; -#else - return "DEBUG: "; -#endif -} - -// TODO -// static internal::MallocCounter *benchmark_mc; - -bool CpuScalingEnabled() { +static bool CpuScalingEnabled() { // On Linux, the CPUfreq subsystem exposes CPU information as files on the // local file system. If reading the exported files fails, then we may not be // running on Linux, so we silently ignore all the read errors. 
for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) { - std::stringstream ss; - ss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor"; - std::string governor_file = ss.str(); + std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu, + "/cpufreq/scaling_governor"); FILE* file = fopen(governor_file.c_str(), "r"); if (!file) break; char buff[16]; size_t bytes_read = fread(buff, 1, sizeof(buff), file); fclose(file); - if (memprefix(buff, bytes_read, "performance") == NULL) return true; + if (memprefix(buff, bytes_read, "performance") == nullptr) return true; } return false; } -// Given a collection of reports, computes their mean and stddev. -// REQUIRES: all runs in "reports" must be from the same benchmark. void ComputeStats(const std::vector& reports, BenchmarkReporter::Run* mean_data, BenchmarkReporter::Run* stddev_data) { + CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports"; // Accumulators. Stat1_d real_accumulated_time_stat; Stat1_d cpu_accumulated_time_stat; - Stat1_d items_per_second_stat; Stat1_d bytes_per_second_stat; - Stat1_d iterations_stat; - Stat1MinMax_d max_heapbytes_used_stat; + Stat1_d items_per_second_stat; + // All repetitions should be run with the same number of iterations so we + // can take this information from the first benchmark. + std::size_t const run_iterations = reports.front().iterations; // Populate the accumulators. 
- for (std::vector::const_iterator it = reports.begin(); - it != reports.end(); ++it) { - CHECK_EQ(reports[0].benchmark_name, it->benchmark_name); + for (BenchmarkReporter::Run const& run : reports) { + CHECK_EQ(reports[0].benchmark_name, run.benchmark_name); + CHECK_EQ(run_iterations, run.iterations); real_accumulated_time_stat += - Stat1_d(it->real_accumulated_time / it->iterations, it->iterations); + Stat1_d(run.real_accumulated_time/run.iterations, run.iterations); cpu_accumulated_time_stat += - Stat1_d(it->cpu_accumulated_time / it->iterations, it->iterations); - items_per_second_stat += Stat1_d(it->items_per_second, it->iterations); - bytes_per_second_stat += Stat1_d(it->bytes_per_second, it->iterations); - iterations_stat += Stat1_d(it->iterations, it->iterations); - max_heapbytes_used_stat += - Stat1MinMax_d(it->max_heapbytes_used, it->iterations); + Stat1_d(run.cpu_accumulated_time/run.iterations, run.iterations); + items_per_second_stat += Stat1_d(run.items_per_second, run.iterations); + bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations); } - // Get the data from the accumulator to BenchmarkRunData's. In the - // computations below we must multiply by the number of iterations since - // PrintRunData will divide by it. + // Get the data from the accumulator to BenchmarkReporter::Run's. 
mean_data->benchmark_name = reports[0].benchmark_name + "_mean"; - mean_data->iterations = iterations_stat.Mean(); + mean_data->iterations = run_iterations; mean_data->real_accumulated_time = real_accumulated_time_stat.Mean() * - mean_data->iterations; + run_iterations; mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Mean() * - mean_data->iterations; + run_iterations; mean_data->bytes_per_second = bytes_per_second_stat.Mean(); mean_data->items_per_second = items_per_second_stat.Mean(); - mean_data->max_heapbytes_used = max_heapbytes_used_stat.Max(); // Only add label to mean/stddev if it is same for all runs mean_data->report_label = reports[0].report_label; - for (size_t i = 1; i < reports.size(); i++) { + for (std::size_t i = 1; i < reports.size(); i++) { if (reports[i].report_label != reports[0].report_label) { mean_data->report_label = ""; break; @@ -183,29 +180,166 @@ void ComputeStats(const std::vector& reports, stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev"; stddev_data->report_label = mean_data->report_label; - stddev_data->iterations = iterations_stat.StdDev(); - // The value of iterations_stat.StdDev() above may be 0 if all the repetitions - // have the same number of iterations. Blindly multiplying by 0 in the - // computation of real/cpu_accumulated_time below would lead to 0/0 in - // PrintRunData. So we skip the multiplication in this case and PrintRunData - // skips the division. 
- if (stddev_data->iterations == 0) { - stddev_data->real_accumulated_time = real_accumulated_time_stat.StdDev(); - stddev_data->cpu_accumulated_time = cpu_accumulated_time_stat.StdDev(); - } else { - stddev_data->real_accumulated_time = real_accumulated_time_stat.StdDev() * - stddev_data->iterations; - stddev_data->cpu_accumulated_time = cpu_accumulated_time_stat.StdDev() * - stddev_data->iterations; - } + stddev_data->iterations = 0; + stddev_data->real_accumulated_time = + real_accumulated_time_stat.StdDev(); + stddev_data->cpu_accumulated_time = + cpu_accumulated_time_stat.StdDev(); stddev_data->bytes_per_second = bytes_per_second_stat.StdDev(); stddev_data->items_per_second = items_per_second_stat.StdDev(); - stddev_data->max_heapbytes_used = max_heapbytes_used_stat.StdDev(); } -} // namespace + +struct ThreadStats { + ThreadStats() : bytes_processed(0), items_processed(0) {} + int64_t bytes_processed; + int64_t items_processed; +}; + +// Timer management class +class TimerManager { + public: + TimerManager(int num_threads, Notification* done) + : num_threads_(num_threads), + done_(done), + running_(false), + real_time_used_(0), + cpu_time_used_(0), + num_finalized_(0), + phase_number_(0), + entered_(0) { + } + + // Called by each thread + void StartTimer() EXCLUDES(lock_) { + bool last_thread = false; + { + MutexLock ml(lock_); + last_thread = Barrier(ml); + if (last_thread) { + CHECK(!running_) << "Called StartTimer when timer is already running"; + running_ = true; + start_real_time_ = walltime::Now(); + start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage(); + } + } + if (last_thread) { + phase_condition_.notify_all(); + } + } + + // Called by each thread + void StopTimer() EXCLUDES(lock_) { + bool last_thread = false; + { + MutexLock ml(lock_); + last_thread = Barrier(ml); + if (last_thread) { + CHECK(running_) << "Called StopTimer when timer is already stopped"; + InternalStop(); + } + } + if (last_thread) { + phase_condition_.notify_all(); + } + } + + // 
Called by each thread + void Finalize() EXCLUDES(lock_) { + MutexLock l(lock_); + num_finalized_++; + if (num_finalized_ == num_threads_) { + CHECK(!running_) << + "The timer should be stopped before the timer is finalized"; + done_->Notify(); + } + } + + // REQUIRES: timer is not running + double real_time_used() EXCLUDES(lock_) { + MutexLock l(lock_); + CHECK(!running_); + return real_time_used_; + } + + // REQUIRES: timer is not running + double cpu_time_used() EXCLUDES(lock_) { + MutexLock l(lock_); + CHECK(!running_); + return cpu_time_used_; + } + + private: + Mutex lock_; + Condition phase_condition_; + int num_threads_; + Notification* done_; + + bool running_; // Is the timer running + double start_real_time_; // If running_ + double start_cpu_time_; // If running_ + + // Accumulated time so far (does not contain current slice if running_) + double real_time_used_; + double cpu_time_used_; + + // How many threads have called Finalize() + int num_finalized_; + + // State for barrier management + int phase_number_; + int entered_; // Number of threads that have entered this barrier + + void InternalStop() REQUIRES(lock_) { + CHECK(running_); + running_ = false; + real_time_used_ += walltime::Now() - start_real_time_; + cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage()) + - start_cpu_time_); + } + + // Enter the barrier and wait until all other threads have also + // entered the barrier. Returns iff this is the last thread to + // enter the barrier. 
+ bool Barrier(MutexLock& ml) REQUIRES(lock_) { + CHECK_LT(entered_, num_threads_); + entered_++; + if (entered_ < num_threads_) { + // Wait for all threads to enter + int phase_number_cp = phase_number_; + auto cb = [this, phase_number_cp]() { + return this->phase_number_ > phase_number_cp; + }; + phase_condition_.wait(ml.native_handle(), cb); + return false; // I was not the last one + } else { + // Last thread has reached the barrier + phase_number_++; + entered_ = 0; + return true; + } + } +}; + +// TimerManager for current run. +static std::unique_ptr timer_manager = nullptr; + +} // end namespace namespace internal { +// Information kept per benchmark we may want to run +struct Benchmark::Instance { + std::string name; + Function* function; + bool has_arg1; + int arg1; + bool has_arg2; + int arg2; + int threads; // Number of concurrent threads to use + bool multithreaded; // Is benchmark multi-threaded? +}; + + // Class for managing registered benchmarks. Note that each registered // benchmark identifies a family of related benchmarks to run. class BenchmarkFamilies { @@ -220,16 +354,17 @@ class BenchmarkFamilies { // Extract the list of benchmark instances that match the specified // regular expression. - void FindBenchmarks(const std::string& re, + bool FindBenchmarks(const std::string& re, std::vector* benchmarks); private: BenchmarkFamilies(); ~BenchmarkFamilies(); std::vector families_; - std::mutex mutex_; + Mutex mutex_; }; + BenchmarkFamilies* BenchmarkFamilies::GetInstance() { static BenchmarkFamilies instance; return &instance; @@ -244,7 +379,7 @@ BenchmarkFamilies::~BenchmarkFamilies() { } size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) { - std::lock_guard l(mutex_); + MutexLock l(mutex_); // This loop attempts to reuse an entry that was previously removed to avoid // unncessary growth of the vector. 
for (size_t index = 0; index < families_.size(); ++index) { @@ -259,392 +394,133 @@ size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) { } void BenchmarkFamilies::RemoveBenchmark(size_t index) { - std::lock_guard l(mutex_); - families_[index] = NULL; + MutexLock l(mutex_); + families_[index] = nullptr; // Don't shrink families_ here, we might be called by the destructor of // BenchmarkFamilies which iterates over the vector. } -void BenchmarkFamilies::FindBenchmarks( +bool BenchmarkFamilies::FindBenchmarks( const std::string& spec, std::vector* benchmarks) { // Make regular expression out of command-line flag + std::string error_msg; Regex re; - std::string re_error; - if (!re.Init(spec, &re_error)) { - std::cerr << "Could not compile benchmark re: " << re_error << std::endl; - return; + if (!re.Init(spec, &error_msg)) { + std::cerr << "Could not compile benchmark re: " << error_msg << std::endl; + return false; } - std::lock_guard l(mutex_); - for (internal::Benchmark* family : families_) { - if (family == nullptr) continue; // Family was deleted + // Special list of thread counts to use when none are specified + std::vector one_thread; + one_thread.push_back(1); - // Match against filter. - if (!re.Match(family->name_)) { - VLOG(1) << "Skipping " << family->name_ << "\n"; - continue; + MutexLock l(mutex_); + for (Benchmark* family : families_) { + // Family was deleted or benchmark doesn't match + if (family == nullptr || !re.Match(family->name_)) continue; + + if (family->arg_count_ == -1) { + family->arg_count_ = 0; + family->args_.emplace_back(-1, -1); } + for (auto const& args : family->args_) { + const std::vector* thread_counts = + (family->thread_counts_.empty() + ? 
&one_thread + : &family->thread_counts_); + for (int num_threads : *thread_counts) { - std::vector instances; - if (family->rangeX_.empty() && family->rangeY_.empty()) { - instances = family->CreateBenchmarkInstances( - Benchmark::kNoRangeIndex, Benchmark::kNoRangeIndex); - std::copy(instances.begin(), instances.end(), - std::back_inserter(*benchmarks)); - } else if (family->rangeY_.empty()) { - for (size_t x = 0; x < family->rangeX_.size(); ++x) { - instances = family->CreateBenchmarkInstances( - x, Benchmark::kNoRangeIndex); - std::copy(instances.begin(), instances.end(), - std::back_inserter(*benchmarks)); - } - } else { - for (size_t x = 0; x < family->rangeX_.size(); ++x) { - for (size_t y = 0; y < family->rangeY_.size(); ++y) { - instances = family->CreateBenchmarkInstances(x, y); - std::copy(instances.begin(), instances.end(), - std::back_inserter(*benchmarks)); + Benchmark::Instance instance; + instance.name = family->name_; + instance.function = family->function_; + instance.has_arg1 = family->arg_count_ >= 1; + instance.arg1 = args.first; + instance.has_arg2 = family->arg_count_ == 2; + instance.arg2 = args.second; + instance.threads = num_threads; + instance.multithreaded = !(family->thread_counts_.empty()); + + // Add arguments to instance name + if (family->arg_count_ >= 1) { + AppendHumanReadable(instance.arg1, &instance.name); } + if (family->arg_count_ >= 2) { + AppendHumanReadable(instance.arg2, &instance.name); + } + + // Add the number of threads used to the name + if (!family->thread_counts_.empty()) { + instance.name += StringPrintF("/threads:%d", instance.threads); + } + + benchmarks->push_back(instance); } } } -} - -std::string ConsoleReporter::PrintMemoryUsage(double bytes) const { - if (!get_memory_usage || bytes < 0.0) return ""; - - std::stringstream ss; - ss << " " << HumanReadableNumber(bytes) << "B peak-mem"; - return ss.str(); -} - -bool ConsoleReporter::ReportContext(const BenchmarkReporter::Context& context) - const { - 
name_field_width_ = context.name_field_width; - - std::cout << "Benchmarking on " << context.num_cpus << " X " - << context.mhz_per_cpu << " MHz CPU" - << ((context.num_cpus > 1) ? "s" : "") << "\n"; - - int remainder_ms; - std::cout << walltime::Print(walltime::Now(), "%Y/%m/%d-%H:%M:%S", - true, // use local timezone - &remainder_ms) << "\n"; - - // Show details of CPU model, caches, TLBs etc. - // if (!context.cpu_info.empty()) - // std::cout << "CPU: " << context.cpu_info.c_str(); - - if (context.cpu_scaling_enabled) { - std::cerr << "CPU scaling is enabled: Benchmark timings may be noisy.\n"; - } - - int output_width = fprintf(stdout, "%s%-*s %10s %10s %10s\n", - Prefix(), int(name_field_width_), "Benchmark", - "Time(ns)", "CPU(ns)", "Iterations"); - std::cout << std::string(output_width - 1, '-').c_str() << "\n"; - return true; } -void ConsoleReporter::ReportRuns( - const std::vector& reports) const { - for (std::vector::const_iterator it = reports.begin(); - it != reports.end(); ++it) { - CHECK_EQ(reports[0].benchmark_name, it->benchmark_name); - PrintRunData(*it); - } - // We don't report aggregated data if there was a single run. 
- if (reports.size() < 2) return; - - BenchmarkReporter::Run mean_data; - BenchmarkReporter::Run stddev_data; - ComputeStats(reports, &mean_data, &stddev_data); - - PrintRunData(mean_data); - PrintRunData(stddev_data); -} - -void ConsoleReporter::PrintRunData(const BenchmarkReporter::Run& result) const { - // Format bytes per second - std::string rate; - if (result.bytes_per_second > 0) { - std::stringstream ss; - ss << " " << HumanReadableNumber(result.bytes_per_second) << "B/s"; - rate = ss.str(); - } - - // Format items per second - std::string items; - if (result.items_per_second > 0) { - std::stringstream ss; - ss << " " << HumanReadableNumber(result.items_per_second) << " items/s"; - items = ss.str(); - } - - ColorPrintf(COLOR_DEFAULT, "%s", Prefix()); - ColorPrintf(COLOR_GREEN, "%-*s ", - name_field_width_, result.benchmark_name.c_str()); - if (result.iterations == 0) { - ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", - result.real_accumulated_time * 1e9, - result.cpu_accumulated_time * 1e9); - } else { - ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", - (result.real_accumulated_time * 1e9) / - (static_cast(result.iterations)), - (result.cpu_accumulated_time * 1e9) / - (static_cast(result.iterations))); - } - ColorPrintf(COLOR_CYAN, "%10lld", result.iterations); - ColorPrintf(COLOR_DEFAULT, "%*s %*s %s %s\n", - 13, rate.c_str(), - 18, items.c_str(), - result.report_label.c_str(), - PrintMemoryUsage(result.max_heapbytes_used).c_str()); -} - -/* TODO(dominic) -void MemoryUsage() { - // if (benchmark_mc) { - // benchmark_mc->Reset(); - //} else { - get_memory_usage = true; - //} -} -*/ - -void PrintUsageAndExit() { - fprintf(stdout, - "benchmark [--benchmark_filter=]\n" - " [--benchmark_iterations=]\n" - " [--benchmark_min_time=]\n" - //" [--benchmark_memory_usage]\n" - " [--benchmark_repetitions=]\n" - " [--color_print={true|false}]\n" - " [--v=]\n"); - exit(0); -} - -void ParseCommandLineFlags(int* argc, const char** argv) { - for (int i = 1; i < *argc; ++i) { - if 
(ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) || - ParseInt32Flag(argv[i], "benchmark_iterations", - &FLAGS_benchmark_iterations) || - ParseDoubleFlag(argv[i], "benchmark_min_time", - &FLAGS_benchmark_min_time) || - // TODO(dominic) - // ParseBoolFlag(argv[i], "gbenchmark_memory_usage", - // &FLAGS_gbenchmark_memory_usage) || - ParseInt32Flag(argv[i], "benchmark_repetitions", - &FLAGS_benchmark_repetitions) || - ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) || - ParseInt32Flag(argv[i], "v", &FLAGS_v)) { - for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1]; - - --(*argc); - --i; - } else if (IsFlag(argv[i], "help")) - PrintUsageAndExit(); - } -} - -} // end namespace internal - -// A clock that provides a fast mechanism to check if we're nearly done. -class State::FastClock { - public: - enum Type { - REAL_TIME, - CPU_TIME - }; - explicit FastClock(Type type) - : type_(type), - approx_time_(NowMicros()), - bg_done_(false), - bg_(BGThreadWrapper, this) { } - - ~FastClock() { - { - std::unique_lock l(bg_mutex_); - bg_done_ = true; - bg_cond_.notify_one(); - } - bg_.join(); - } - - // Returns true if the current time is guaranteed to be past "when_micros". - // This method is very fast. - inline bool HasReached(int64_t when_micros) { - return std::atomic_load(&approx_time_) >= when_micros; - } - - // Returns the current time in microseconds past the epoch. - int64_t NowMicros() const { - double t = 0; - switch (type_) { - case REAL_TIME: - t = walltime::Now(); - break; - case CPU_TIME: - t = MyCPUUsage() + ChildrenCPUUsage(); - break; - } - return static_cast(t * kNumMicrosPerSecond); - } - - // Reinitialize if necessary (since clock type may be change once benchmark - // function starts running - see UseRealTime). 
- void InitType(Type type) { - type_ = type; - std::lock_guard l(bg_mutex_); - std::atomic_store(&approx_time_, NowMicros()); - } - - private: - Type type_; - std::atomic approx_time_; // Last time measurement taken by bg_ - bool bg_done_; // This is used to signal background thread to exit - std::mutex bg_mutex_; - std::condition_variable bg_cond_; - std::thread bg_; // Background thread that updates last_time_ once every ms - - static void* BGThreadWrapper(void* that) { - ((FastClock*)that)->BGThread(); - return NULL; - } - - void BGThread() { - std::unique_lock l(bg_mutex_); - while (!bg_done_) - { - // Set timeout to 1 ms. - bg_cond_.wait_for(l, std::chrono::milliseconds(1)); - std::atomic_store(&approx_time_, NowMicros()); - } - } - - BENCHMARK_DISALLOW_COPY_AND_ASSIGN(FastClock); -}; - -struct State::ThreadStats { - int64_t bytes_processed; - int64_t items_processed; - - ThreadStats() { Reset(); } - - void Reset() { - bytes_processed = 0; - items_processed = 0; - } - - void Add(const ThreadStats& other) { - bytes_processed += other.bytes_processed; - items_processed += other.items_processed; - } -}; - -namespace internal { - -// Information kept per benchmark we may want to run -struct Benchmark::Instance { - Instance() - : bm(nullptr), - threads(1), - rangeXset(false), - rangeX(kNoRange), - rangeYset(false), - rangeY(kNoRange) {} - - std::string name; - Benchmark* bm; - int threads; // Number of concurrent threads to use - - bool rangeXset; - int rangeX; - bool rangeYset; - int rangeY; - - bool multithreaded() const { return !bm->thread_counts_.empty(); } -}; - -} // end namespace internal - -struct State::SharedState { - const internal::Benchmark::Instance* instance; - std::mutex mu; - std::condition_variable cond; - int starting; // Number of threads that have entered STARTING state - int stopping; // Number of threads that have entered STOPPING state - int exited; // Number of threads that have complete exited - int threads; // Number of total threads 
that are running concurrently - ThreadStats stats; - std::vector runs; // accumulated runs - std::string label; - - explicit SharedState(const internal::Benchmark::Instance* b) - : instance(b), - starting(0), - stopping(0), - exited(0), - threads(b == nullptr ? 1 : b->threads) { } - - BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SharedState); -}; - -namespace internal { - -Benchmark::Benchmark(const char* name, BenchmarkFunction f) - : name_(name), function_(f) { +Benchmark::Benchmark(const char* name, + Function* f) + : name_(name), function_(f), arg_count_(-1) { registration_index_ = BenchmarkFamilies::GetInstance()->AddBenchmark(this); } -Benchmark::~Benchmark() { +Benchmark::~Benchmark() { BenchmarkFamilies::GetInstance()->RemoveBenchmark(registration_index_); } Benchmark* Benchmark::Arg(int x) { - std::lock_guard l(mutex_); - rangeX_.push_back(x); + CHECK(arg_count_ == -1 || arg_count_ == 1); + arg_count_ = 1; + args_.emplace_back(x, -1); return this; } Benchmark* Benchmark::Range(int start, int limit) { + CHECK(arg_count_ == -1 || arg_count_ == 1); + arg_count_ = 1; std::vector arglist; AddRange(&arglist, start, limit, kRangeMultiplier); - std::lock_guard l(mutex_); - for (size_t i = 0; i < arglist.size(); ++i) rangeX_.push_back(arglist[i]); + for (int i : arglist) { + args_.emplace_back(i, -1); + } return this; } Benchmark* Benchmark::DenseRange(int start, int limit) { + CHECK(arg_count_ == -1 || arg_count_ == 1); + arg_count_ = 1; CHECK_GE(start, 0); CHECK_LE(start, limit); - std::lock_guard l(mutex_); - for (int arg = start; arg <= limit; ++arg) rangeX_.push_back(arg); + for (int arg = start; arg <= limit; arg++) { + args_.emplace_back(arg, -1); + } return this; } Benchmark* Benchmark::ArgPair(int x, int y) { - std::lock_guard l(mutex_); - rangeX_.push_back(x); - rangeY_.push_back(y); + CHECK(arg_count_ == -1 || arg_count_ == 2); + arg_count_ = 2; + args_.emplace_back(x, y); return this; } Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) { + 
CHECK(arg_count_ == -1 || arg_count_ == 2); + arg_count_ = 2; std::vector arglist1, arglist2; AddRange(&arglist1, lo1, hi1, kRangeMultiplier); AddRange(&arglist2, lo2, hi2, kRangeMultiplier); - std::lock_guard l(mutex_); - rangeX_.resize(arglist1.size()); - std::copy(arglist1.begin(), arglist1.end(), rangeX_.begin()); - rangeY_.resize(arglist2.size()); - std::copy(arglist2.begin(), arglist2.end(), rangeY_.begin()); + for (int i : arglist1) { + for (int j : arglist2) { + args_.emplace_back(i, j); + } + } return this; } @@ -655,7 +531,6 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) { Benchmark* Benchmark::Threads(int t) { CHECK_GT(t, 0); - std::lock_guard l(mutex_); thread_counts_.push_back(t); return this; } @@ -664,14 +539,13 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { CHECK_GT(min_threads, 0); CHECK_GE(max_threads, min_threads); - std::lock_guard l(mutex_); AddRange(&thread_counts_, min_threads, max_threads, 2); return this; } Benchmark* Benchmark::ThreadPerCpu() { - std::lock_guard l(mutex_); - thread_counts_.push_back(NumCPUs()); + static int num_cpus = NumCPUs(); + thread_counts_.push_back(num_cpus); return this; } @@ -682,443 +556,310 @@ void Benchmark::AddRange(std::vector* dst, int lo, int hi, int mult) { // Add "lo" dst->push_back(lo); + static const int kint32max = std::numeric_limits::max(); + // Now space out the benchmarks in multiples of "mult" - for (int32_t i = 1; i < std::numeric_limits::max() / mult; - i *= mult) { + for (int32_t i = 1; i < kint32max/mult; i *= mult) { if (i >= hi) break; - if (i > lo) dst->push_back(i); + if (i > lo) { + dst->push_back(i); + } } // Add "hi" (if different from "lo") - if (hi != lo) dst->push_back(hi); -} - -std::vector Benchmark::CreateBenchmarkInstances( - size_t rangeXindex, size_t rangeYindex) { - // Special list of thread counts to use when none are specified - std::vector one_thread; - one_thread.push_back(1); - - std::vector instances; - - const 
bool is_multithreaded = (!thread_counts_.empty()); - const std::vector& thread_counts = - (is_multithreaded ? thread_counts_ : one_thread); - for (int num_threads : thread_counts) { - Instance instance; - instance.name = name_; - instance.bm = this; - instance.threads = num_threads; - - if (rangeXindex != kNoRangeIndex) { - instance.rangeX = rangeX_[rangeXindex]; - instance.rangeXset = true; - AppendHumanReadable(instance.rangeX, &instance.name); - } - if (rangeYindex != kNoRangeIndex) { - instance.rangeY = rangeY_[rangeYindex]; - instance.rangeYset = true; - AppendHumanReadable(instance.rangeY, &instance.name); - } - - // Add the number of threads used to the name - if (is_multithreaded) { - std::stringstream ss; - ss << "/threads:" << instance.threads; - instance.name += ss.str(); - } - - instances.push_back(instance); + if (hi != lo) { + dst->push_back(hi); } - - return instances; } -void Benchmark::MeasureOverhead() { - State::FastClock clock(State::FastClock::CPU_TIME); - State::SharedState state(nullptr); - State runner(&clock, &state, 0); - while (runner.KeepRunning()) { - } - overhead = state.runs[0].real_accumulated_time / - static_cast(state.runs[0].iterations); - VLOG(1) << "Per-iteration overhead for doing nothing: " << overhead << "\n"; -} +} // end namespace internal -void Benchmark::RunInstance(const Instance& b, const BenchmarkReporter* br) { - use_real_time = false; - running_benchmark = true; - // get_memory_usage = FLAGS_gbenchmark_memory_usage; - State::FastClock clock(State::FastClock::CPU_TIME); +namespace { - // Initialize the test runners. - State::SharedState state(&b); + +// Execute one thread of benchmark b for the specified number of iterations. +// Adds the stats collected for the thread into *total. 
+void RunInThread(const benchmark::internal::Benchmark::Instance* b, + int iters, int thread_id, + ThreadStats* total) EXCLUDES(GetBenchmarkLock()) { + State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id); + b->function(st); + CHECK(st.iterations() == st.max_iterations) << + "Benchmark returned before State::KeepRunning() returned false!"; { - std::vector> runners; - for (int i = 0; i < b.threads; ++i) - runners.push_back(std::unique_ptr(new State(&clock, &state, i))); - - // Run them all. - for (int i = 0; i < b.threads; ++i) { - if (b.multithreaded()) - runners[i]->RunAsThread(); - else - runners[i]->Run(); - } - if (b.multithreaded()) { - for (int i = 0; i < b.threads; ++i) runners[i]->Wait(); - } + MutexLock l(GetBenchmarkLock()); + total->bytes_processed += st.bytes_processed(); + total->items_processed += st.items_processed(); } - /* - double mem_usage = 0; - if (get_memory_usage) { - // Measure memory usage - Notification mem_done; - BenchmarkRun mem_run; - BenchmarkRun::SharedState mem_shared(&b, 1); - mem_run.Init(&clock, &mem_shared, 0); + + timer_manager->Finalize(); +} + +void RunBenchmark(const benchmark::internal::Benchmark::Instance& b, + const BenchmarkReporter* br) EXCLUDES(GetBenchmarkLock()) { + int iters = FLAGS_benchmark_iterations ? 
FLAGS_benchmark_iterations + : 1; + std::vector reports; + + std::vector pool; + if (b.multithreaded) + pool.resize(b.threads); + + for (int i = 0; i < FLAGS_benchmark_repetitions; i++) { + std::string mem; + while (true) { + // Try benchmark + VLOG(2) << "Running " << b.name << " for " << iters << "\n"; + { - testing::MallocCounter mc(testing::MallocCounter::THIS_THREAD_ONLY); - benchmark_mc = &mc; - mem_run.Run(&mem_done); - mem_done.WaitForNotification(); - benchmark_mc = NULL; - mem_usage = mc.PeakHeapGrowth(); + MutexLock l(GetBenchmarkLock()); + GetReportLabel()->clear(); + use_real_time = false; } - } - */ - running_benchmark = false; - for (BenchmarkReporter::Run& report : state.runs) { - double seconds = (use_real_time ? report.real_accumulated_time - : report.cpu_accumulated_time); - report.benchmark_name = b.name; - report.report_label = state.label; - report.bytes_per_second = state.stats.bytes_processed / seconds; - report.items_per_second = state.stats.items_processed / seconds; - report.max_heapbytes_used = MeasurePeakHeapMemory(b); - } + Notification done; + timer_manager = std::unique_ptr(new TimerManager(b.threads, &done)); - br->ReportRuns(state.runs); -} + ThreadStats total; + running_benchmark = true; + if (b.multithreaded) { + // If this is out first iteration of the while(true) loop then the + // threads haven't been started and can't be joined. Otherwise we need + // to join the thread before replacing them. + for (std::thread& thread : pool) { + if (thread.joinable()) + thread.join(); + } + for (std::size_t ti = 0; ti < pool.size(); ++ti) { + pool[ti] = std::thread(&RunInThread, &b, iters, ti, &total); + } + } else { + // Run directly in this thread + RunInThread(&b, iters, 0, &total); + } + done.WaitForNotification(); + running_benchmark = false; -// Run the specified benchmark, measure its peak memory usage, and -// return the peak memory usage. 
-double Benchmark::MeasurePeakHeapMemory(const Instance&) { - if (!get_memory_usage) return 0.0; - double bytes = 0.0; - /* TODO(dominich) - // Should we do multi-threaded runs? - const int num_threads = 1; - const int num_iters = 1; - { - // internal::MallocCounter mc(internal::MallocCounter::THIS_THREAD_ONLY); - running_benchmark = true; - timer_manager = new TimerManager(1, NULL); - // benchmark_mc = &mc; - timer_manager->StartTimer(); + const double cpu_accumulated_time = timer_manager->cpu_time_used(); + const double real_accumulated_time = timer_manager->real_time_used(); + timer_manager.reset(); - b.Run(num_iters); + VLOG(2) << "Ran in " << cpu_accumulated_time << "/" + << real_accumulated_time << "\n"; - running_benchmark = false; - delete timer_manager; - timer_manager = NULL; - // benchmark_mc = NULL; - // bytes = mc.PeakHeapGrowth(); - } - */ - return bytes; -} + // Base decisions off of real time if requested by this benchmark. + double seconds = cpu_accumulated_time; + std::string label; + { + MutexLock l(GetBenchmarkLock()); + label = *GetReportLabel(); + if (use_real_time) { + seconds = real_accumulated_time; + } + } -} // end namespace internal + // If this was the first run, was elapsed time or cpu time large enough? + // If this is not the first run, go with the current value of iter. 
+ if ((i > 0) || + (iters == FLAGS_benchmark_iterations) || + (iters >= kMaxIterations) || + (seconds >= FLAGS_benchmark_min_time) || + (real_accumulated_time >= 5*FLAGS_benchmark_min_time)) { + double bytes_per_second = 0; + if (total.bytes_processed > 0 && seconds != 0.0) { + bytes_per_second = (total.bytes_processed / seconds); + } + double items_per_second = 0; + if (total.items_processed > 0 && seconds != 0.0) { + items_per_second = (total.items_processed / seconds); + } -State::State(FastClock* clock, SharedState* s, int t) - : thread_index(t), - state_(STATE_INITIAL), - clock_(clock), - shared_(s), - iterations_(0), - start_cpu_(0.0), - start_time_(0.0), - stop_time_micros_(0.0), - start_pause_cpu_(0.0), - pause_cpu_time_(0.0), - start_pause_real_(0.0), - pause_real_time_(0.0), - total_iterations_(0), - interval_micros_(static_cast(kNumMicrosPerSecond * - FLAGS_benchmark_min_time / - FLAGS_benchmark_repetitions)), - is_continuation_(false), - stats_(new ThreadStats()) { - CHECK(clock != nullptr); - CHECK(s != nullptr); -} + // Create report about this benchmark run. + BenchmarkReporter::Run report; + report.benchmark_name = b.name; + report.report_label = label; + // Report the total iterations across all threads. + report.iterations = static_cast(iters) * b.threads; + report.real_accumulated_time = real_accumulated_time; + report.cpu_accumulated_time = cpu_accumulated_time; + report.bytes_per_second = bytes_per_second; + report.items_per_second = items_per_second; + reports.push_back(report); + break; + } -bool State::KeepRunning() { - // Fast path - if ((FLAGS_benchmark_iterations == 0 && - !clock_->HasReached(stop_time_micros_ + - kNumMicrosPerSecond * pause_real_time_)) || - iterations_ < FLAGS_benchmark_iterations) { - ++iterations_; - return true; - } - - // To block thread 0 until all other threads exit, we have a signal exit - // point for KeepRunning() to return false. The fast path above always - // returns true. 
- bool ret = false; - switch (state_) { - case STATE_INITIAL: - ret = StartRunning(); - break; - case STATE_STARTING: - CHECK(false); - ret = true; - break; - case STATE_RUNNING: - ret = FinishInterval(); - break; - case STATE_STOPPING: - ret = MaybeStop(); - break; - case STATE_STOPPED: - CHECK(false); - ret = true; - break; - } - - if (!ret && shared_->threads > 1 && thread_index == 0){ - std::unique_lock l(shared_->mu); - - // Block until all other threads have exited. We can then safely cleanup - // without other threads continuing to access shared variables inside the - // user-provided run function. - while (shared_->exited < shared_->threads - 1) { - shared_->cond.wait(l); + // See how much iterations should be increased by + // Note: Avoid division by zero with max(seconds, 1ns). + double multiplier = FLAGS_benchmark_min_time * 1.4 / std::max(seconds, 1e-9); + // If our last run was at least 10% of FLAGS_benchmark_min_time then we + // use the multiplier directly. Otherwise we use at most 10 times + // expansion. + // NOTE: When the last run was at least 10% of the min time the max + // expansion should be 14x. + bool is_significant = (seconds / FLAGS_benchmark_min_time) > 0.1; + multiplier = is_significant ? 
multiplier : std::min(10.0, multiplier); + if (multiplier <= 1.0) multiplier = 2.0; + double next_iters = std::max(multiplier * iters, iters + 1.0); + if (next_iters > kMaxIterations) { + next_iters = kMaxIterations; + } + VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; + iters = static_cast(next_iters + 0.5); } } - - if (ret) { - ++iterations_; + br->ReportRuns(reports); + if (b.multithreaded) { + for (std::thread& thread : pool) + thread.join(); } - return ret; +} + +} // namespace + +State::State(size_t max_iters, bool has_x, int x, bool has_y, int y, + int thread_i) + : started_(false), total_iterations_(0), + has_range_x_(has_x), range_x_(x), + has_range_y_(has_y), range_y_(y), + bytes_processed_(0), items_processed_(0), + thread_index(thread_i), + max_iterations(max_iters) +{ + CHECK(max_iterations != 0) << "At least one iteration must be run"; } void State::PauseTiming() { - start_pause_cpu_ = MyCPUUsage() + ChildrenCPUUsage(); - start_pause_real_ = walltime::Now(); + // Add in time accumulated so far + CHECK(running_benchmark); + timer_manager->StopTimer(); } void State::ResumeTiming() { - pause_cpu_time_ += MyCPUUsage() + ChildrenCPUUsage() - start_pause_cpu_; - pause_real_time_ += walltime::Now() - start_pause_real_; + CHECK(running_benchmark); + timer_manager->StartTimer(); } -void State::SetBytesProcessed(int64_t bytes) { - CHECK_EQ(STATE_STOPPED, state_); - std::lock_guard l(shared_->mu); - stats_->bytes_processed = bytes; +void State::UseRealTime() { + MutexLock l(GetBenchmarkLock()); + use_real_time = true; } -void State::SetItemsProcessed(int64_t items) { - CHECK_EQ(STATE_STOPPED, state_); - std::lock_guard l(shared_->mu); - stats_->items_processed = items; +void State::SetLabel(const char* label) { + CHECK(running_benchmark); + MutexLock l(GetBenchmarkLock()); + *GetReportLabel() = label; } -void State::SetLabel(const std::string& label) { - CHECK_EQ(STATE_STOPPED, state_); - std::lock_guard l(shared_->mu); - shared_->label = 
label; -} - -int State::range_x() const { - CHECK(shared_->instance->rangeXset); - /* - << - "Failed to get range_x as it was not set. Did you register your " - "benchmark with a range parameter?"; - */ - return shared_->instance->rangeX; -} - -int State::range_y() const { - CHECK(shared_->instance->rangeYset); - /* << - "Failed to get range_y as it was not set. Did you register your " - "benchmark with a range parameter?"; - */ - return shared_->instance->rangeY; -} - -bool State::StartRunning() { - bool last_thread = false; - { - std::lock_guard l(shared_->mu); - CHECK_EQ(state_, STATE_INITIAL); - state_ = STATE_STARTING; - is_continuation_ = false; - CHECK_LT(shared_->starting, shared_->threads); - ++shared_->starting; - last_thread = shared_->starting == shared_->threads; - } - - if (last_thread) { - clock_->InitType(use_real_time ? FastClock::REAL_TIME - : FastClock::CPU_TIME); - { - std::lock_guard l(starting_mutex); - starting_cv.notify_all(); - } - } else { - std::unique_lock l(starting_mutex); - starting_cv.wait(l); - } - CHECK_EQ(state_, STATE_STARTING); - state_ = STATE_RUNNING; - - NewInterval(); - return true; -} - -void State::NewInterval() { - stop_time_micros_ = clock_->NowMicros() + interval_micros_; - if (!is_continuation_) { - VLOG(1) << "Starting new interval; stopping in " << interval_micros_ - << "\n"; - iterations_ = 0; - pause_cpu_time_ = 0; - pause_real_time_ = 0; - start_cpu_ = MyCPUUsage() + ChildrenCPUUsage(); - start_time_ = walltime::Now(); - } else { - VLOG(1) << "Continuing interval; stopping in " << interval_micros_ - << "\n"; - } -} - -bool State::FinishInterval() { - if ((FLAGS_benchmark_iterations != 0 && - iterations_ < - FLAGS_benchmark_iterations / FLAGS_benchmark_repetitions) || - iterations_ < 1) { - interval_micros_ *= 2; - VLOG(1) << "Not enough iterations in interval; " - << "Trying again for " << interval_micros_ << " useconds.\n"; - is_continuation_ = false; - NewInterval(); - return true; - } - - BenchmarkReporter::Run 
data; - data.iterations = iterations_; - data.thread_index = thread_index; - - const double accumulated_time = walltime::Now() - start_time_; - const double total_overhead = overhead * iterations_; - CHECK_LT(pause_real_time_, accumulated_time); - CHECK_LT(pause_real_time_ + total_overhead, accumulated_time); - data.real_accumulated_time = - accumulated_time - (pause_real_time_ + total_overhead); - data.cpu_accumulated_time = (MyCPUUsage() + ChildrenCPUUsage()) - - (pause_cpu_time_ + start_cpu_); - total_iterations_ += iterations_; - - bool keep_going = false; - { - std::lock_guard l(shared_->mu); - - // Either replace the last or add a new data point. - if (is_continuation_) - shared_->runs.back() = data; - else - shared_->runs.push_back(data); - - if (FLAGS_benchmark_iterations != 0) { - // If we need more iterations, run another interval as a continuation. - keep_going = total_iterations_ < FLAGS_benchmark_iterations; - is_continuation_ = keep_going; - } else { - // If this is a repetition, run another interval as a new data point. - keep_going = shared_->runs.size() < - static_cast(FLAGS_benchmark_repetitions); - is_continuation_ = !keep_going; - } - - if (!keep_going) { - ++shared_->stopping; - if (shared_->stopping < shared_->threads) { - // Other threads are still running, so continue running but without - // timing to present an expected background load to the other threads. 
- state_ = STATE_STOPPING; - keep_going = true; - } else { - state_ = STATE_STOPPED; - } - } - } - - if (state_ == STATE_RUNNING) NewInterval(); - return keep_going; -} - -bool State::MaybeStop() { - std::lock_guard l(shared_->mu); - if (shared_->stopping < shared_->threads) { - CHECK_EQ(state_, STATE_STOPPING); - return true; - } - state_ = STATE_STOPPED; - return false; -} - -void State::Run() { - stats_->Reset(); - shared_->instance->bm->function_(*this); - { - std::lock_guard l(shared_->mu); - shared_->stats.Add(*stats_); - } -} - -void State::RunAsThread() { - thread_ = std::thread(State::RunWrapper, this); -} - -void State::Wait() { - if (thread_.joinable()) { - thread_.join(); - } -} - -// static -void* State::RunWrapper(void* arg) { - State* that = (State*)arg; - CHECK(that != nullptr); - that->Run(); - - std::lock_guard l(that->shared_->mu); - - that->shared_->exited++; - if (that->thread_index > 0 && - that->shared_->exited == that->shared_->threads - 1) { - // All threads but thread 0 have exited the user-provided run function. - // Thread 0 can now wake up and exit. - that->shared_->cond.notify_one(); - } - - return nullptr; -} +BenchmarkReporter::~BenchmarkReporter() {} namespace internal { +bool ConsoleReporter::ReportContext(const Context& context) const { + name_field_width_ = context.name_field_width; + + fprintf(stdout, + "Run on (%d X %0.0f MHz CPU%s)\n", + context.num_cpus, + context.mhz_per_cpu, + (context.num_cpus > 1) ? 
"s" : ""); + + int remainder_us; + std::string walltime_str = walltime::Print( + walltime::Now(), "%Y/%m/%d-%H:%M:%S", + true, // use local timezone + &remainder_us); + fprintf(stdout, "%s\n", walltime_str.c_str()); + + if (context.cpu_scaling_enabled) { + fprintf(stdout, "***WARNING*** CPU scaling is enabled, the benchmark " + "timings may be noisy\n"); + } + +#ifndef NDEBUG + fprintf(stdout, "Build Type: DEBUG\n"); +#endif + + int output_width = + fprintf(stdout, + "%-*s %10s %10s %10s\n", + static_cast(name_field_width_), + "Benchmark", + "Time(ns)", "CPU(ns)", + "Iterations"); + fprintf(stdout, "%s\n", std::string(output_width - 1, '-').c_str()); + + return true; +} + +void ConsoleReporter::ReportRuns( + const std::vector& reports) const { + if (reports.empty()) { + return; + } + + for (Run const& run : reports) { + CHECK_EQ(reports[0].benchmark_name, run.benchmark_name); + PrintRunData(run); + } + + if (reports.size() < 2) { + // We don't report aggregated data if there was a single run. + return; + } + + Run mean_data; + Run stddev_data; + ComputeStats(reports, &mean_data, &stddev_data); + + // Output using PrintRun. 
+ PrintRunData(mean_data); + PrintRunData(stddev_data); + fprintf(stdout, "\n"); +} + +void ConsoleReporter::PrintRunData(const Run& result) const { + // Format bytes per second + std::string rate; + if (result.bytes_per_second > 0) { + rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s"); + } + + // Format items per second + std::string items; + if (result.items_per_second > 0) { + items = StrCat(" ", HumanReadableNumber(result.items_per_second), + " items/s"); + } + + double const multiplier = 1e9; // nano second multiplier + ColorPrintf(COLOR_GREEN, "%-*s ", + name_field_width_, result.benchmark_name.c_str()); + if (result.iterations == 0) { + ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", + result.real_accumulated_time * multiplier, + result.cpu_accumulated_time * multiplier); + } else { + ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ", + (result.real_accumulated_time * multiplier) / + (static_cast(result.iterations)), + (result.cpu_accumulated_time * multiplier) / + (static_cast(result.iterations))); + } + ColorPrintf(COLOR_CYAN, "%10lld", result.iterations); + ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n", + 13, rate.c_str(), + 18, items.c_str(), + result.report_label.c_str()); +} + void RunMatchingBenchmarks(const std::string& spec, const BenchmarkReporter* reporter) { + CHECK(reporter != nullptr); if (spec.empty()) return; - std::vector benchmarks; - BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &benchmarks); + std::vector benchmarks; + auto families = benchmark::internal::BenchmarkFamilies::GetInstance(); + if (!families->FindBenchmarks(spec, &benchmarks)) return; + // Determine the width of the name field using a minimum width of 10. // Also determine max number of threads needed. 
@@ -1144,45 +885,78 @@ void RunMatchingBenchmarks(const std::string& spec, BenchmarkReporter::Context context; context.num_cpus = NumCPUs(); context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f; - // context.cpu_info = base::CompactCPUIDInfoString(); + context.cpu_scaling_enabled = CpuScalingEnabled(); context.name_field_width = name_field_width; - if (reporter->ReportContext(context)) - for (internal::Benchmark::Instance& benchmark : benchmarks) - Benchmark::RunInstance(benchmark, reporter); + if (reporter->ReportContext(context)) { + for (const auto& benchmark : benchmarks) { + RunBenchmark(benchmark, reporter); + } + } } -void FindMatchingBenchmarkNames(const std::string& spec, - std::vector* benchmark_names) { - if (spec.empty()) return; +} // end namespace internal - std::vector benchmarks; - BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &benchmarks); - std::transform(benchmarks.begin(), benchmarks.end(), benchmark_names->begin(), - [](const internal::Benchmark::Instance& b) { return b.name; }); -} -} // end namespace internal - -void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter /*= nullptr*/) { +void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter) { std::string spec = FLAGS_benchmark_filter; if (spec.empty() || spec == "all") spec = "."; // Regexp that matches all benchmarks internal::ConsoleReporter default_reporter; - internal::RunMatchingBenchmarks( - spec, reporter == nullptr ? &default_reporter : reporter); + internal::RunMatchingBenchmarks(spec, reporter ? 
reporter : &default_reporter); } -void UseRealTime() { use_real_time = true; } +namespace internal { + +void PrintUsageAndExit() { + fprintf(stdout, + "benchmark" + " [--benchmark_filter=]\n" + " [--benchmark_iterations=]\n" + " [--benchmark_min_time=]\n" + " [--benchmark_repetitions=]\n" + " [--color_print={true|false}]\n" + " [--v=]\n"); + exit(0); +} + +void ParseCommandLineFlags(int* argc, const char** argv) { + using namespace benchmark; + for (int i = 1; i < *argc; ++i) { + if ( + ParseStringFlag(argv[i], "benchmark_filter", + &FLAGS_benchmark_filter) || + ParseInt32Flag(argv[i], "benchmark_iterations", + &FLAGS_benchmark_iterations) || + ParseDoubleFlag(argv[i], "benchmark_min_time", + &FLAGS_benchmark_min_time) || + ParseInt32Flag(argv[i], "benchmark_repetitions", + &FLAGS_benchmark_repetitions) || + ParseBoolFlag(argv[i], "color_print", + &FLAGS_color_print) || + ParseInt32Flag(argv[i], "v", &FLAGS_v)) { + for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1]; + + --(*argc); + --i; + } else if (IsFlag(argv[i], "help")) { + PrintUsageAndExit(); + } + } +} + +} // end namespace internal void Initialize(int* argc, const char** argv) { internal::ParseCommandLineFlags(argc, argv); internal::SetLogLevel(FLAGS_v); - // Ensure walltime is initialized by a single thread by forcing the - // initialization. + // TODO remove this. It prints some output the first time it is called. + // We don't want to have this output printed during benchmarking. + MyCPUUsage(); + // The first call to walltime::Now initializes it. Call it once to + // prevent the initialization from happening in a benchmark.
walltime::Now(); - internal::Benchmark::MeasureOverhead(); } -} // end namespace benchmark +} // end namespace benchmark diff --git a/src/mutex.h b/src/mutex.h new file mode 100644 index 00000000..f37ec35b --- /dev/null +++ b/src/mutex.h @@ -0,0 +1,142 @@ +#ifndef BENCHMARK_MUTEX_H_ +#define BENCHMARK_MUTEX_H_ + +#include +#include + +// Enable thread safety attributes only with clang. +// The attributes can be safely erased when compiling with other compilers. +#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES) +#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) +#else +#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op +#endif + +#define CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(capability(x)) + +#define SCOPED_CAPABILITY \ + THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable) + +#define GUARDED_BY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x)) + +#define PT_GUARDED_BY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x)) + +#define ACQUIRED_BEFORE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__)) + +#define ACQUIRED_AFTER(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__)) + +#define REQUIRES(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__)) + +#define REQUIRES_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__)) + +#define ACQUIRE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__)) + +#define ACQUIRE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__)) + +#define RELEASE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__)) + +#define RELEASE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__)) + +#define TRY_ACQUIRE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__)) + +#define TRY_ACQUIRE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__)) + +#define EXCLUDES(...) 
\ + THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) + +#define ASSERT_CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x)) + +#define ASSERT_SHARED_CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x)) + +#define RETURN_CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) + +#define NO_THREAD_SAFETY_ANALYSIS \ + THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + + +namespace benchmark { + +typedef std::condition_variable Condition; + +// NOTE: Wrappers for std::mutex and std::unique_lock are provided so that +// we can annotate them with thread safety attributes and use the +// -Wthread-safety warning with clang. The standard library types cannot be +// used directly because they do not provided the required annotations. +class CAPABILITY("mutex") Mutex +{ +public: + Mutex() {} + + void lock() ACQUIRE() { mut_.lock(); } + void unlock() RELEASE() { mut_.unlock(); } + std::mutex& native_handle() { + return mut_; + } +private: + std::mutex mut_; +}; + + +class SCOPED_CAPABILITY MutexLock +{ + typedef std::unique_lock MutexLockImp; +public: + MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle()) + { } + ~MutexLock() RELEASE() {} + MutexLockImp& native_handle() { return ml_; } +private: + MutexLockImp ml_; +}; + + +class Notification +{ +public: + Notification() : notified_yet_(false) { } + + void WaitForNotification() const EXCLUDES(mutex_) { + MutexLock m_lock(mutex_); + auto notified_fn = [this]() REQUIRES(mutex_) { + return this->HasBeenNotified(); + }; + cv_.wait(m_lock.native_handle(), notified_fn); + } + + void Notify() EXCLUDES(mutex_) { + { + MutexLock lock(mutex_); + notified_yet_ = 1; + } + cv_.notify_all(); + } + +private: + bool HasBeenNotified() const REQUIRES(mutex_) { + return notified_yet_; + } + + mutable Mutex mutex_; + mutable std::condition_variable cv_; + bool notified_yet_ GUARDED_BY(mutex_); +}; + +} // end namespace benchmark + +#endif // BENCHMARK_MUTEX_H_ diff --git 
a/src/string_util.cc b/src/string_util.cc index 1be15341..ee1badc8 100644 --- a/src/string_util.cc +++ b/src/string_util.cc @@ -24,13 +24,13 @@ static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), "Small SI and Big SI unit arrays must be the same size"); -static const int kUnitsSize = arraysize(kBigSIUnits); +static const int64_t kUnitsSize = arraysize(kBigSIUnits); } // end anonymous namespace void ToExponentAndMantissa(double val, double thresh, int precision, double one_k, std::string* mantissa, - int* exponent) { + int64_t* exponent) { std::stringstream mantissa_stream; if (val < 0) { @@ -80,10 +80,10 @@ void ToExponentAndMantissa(double val, double thresh, int precision, *mantissa = mantissa_stream.str(); } -std::string ExponentToPrefix(int exponent, bool iec) { +std::string ExponentToPrefix(int64_t exponent, bool iec) { if (exponent == 0) return ""; - const int index = (exponent > 0 ? exponent - 1 : -exponent - 1); + const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1); if (index >= kUnitsSize) return ""; const char* array = @@ -97,7 +97,7 @@ std::string ExponentToPrefix(int exponent, bool iec) { std::string ToBinaryStringFullySpecified(double value, double threshold, int precision) { std::string mantissa; - int exponent; + int64_t exponent; ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa, &exponent); return mantissa + ExponentToPrefix(exponent, false); diff --git a/src/sysinfo.cc b/src/sysinfo.cc index ee3c238e..ace7caa4 100644 --- a/src/sysinfo.cc +++ b/src/sysinfo.cc @@ -34,6 +34,7 @@ #include "check.h" #include "cycleclock.h" #include "internal_macros.h" +#include "log.h" #include "sleep.h" namespace benchmark { @@ -322,7 +323,7 @@ double MyCPUUsage() { return value; } // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage(). - std::cout << "Reading /proc/self/cputime_ns failed. 
Using getrusage().\n"; + VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n"; use_cputime_ns = false; } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5d4721be..bc62f432 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,3 +20,6 @@ add_test(filter_regex_none filter_test --benchmark_filter=monkey 0) add_test(filter_regex_wildcard filter_test --benchmark_filter=.*Calculate.* 16) add_test(filter_regex_begin filter_test --benchmark_filter=^BM_Calculate.* 16) add_test(filter_regex_end filter_test --benchmark_filter=.*Pi$ 8) + +compile_benchmark_test(basic_test) +add_test(basic basic_test) diff --git a/test/basic_test.cc b/test/basic_test.cc new file mode 100644 index 00000000..d14f577a --- /dev/null +++ b/test/basic_test.cc @@ -0,0 +1,105 @@ + +#include + +#include "benchmark/benchmark.h" + +#define BASIC_BENCHMARK_TEST(x) \ + BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192) + +void BM_empty(benchmark::State& state) { + while (state.KeepRunning()) { + volatile std::size_t x = state.iterations(); + ((void)x); + } +} +BENCHMARK(BM_empty); +BENCHMARK(BM_empty)->ThreadPerCpu(); + +void BM_spin_empty(benchmark::State& state) { + while (state.KeepRunning()) { + for (int x = 0; x < state.range_x(); ++x) { + volatile int dummy = x; + ((void)dummy); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_empty); +BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu(); + +void BM_spin_pause_before(benchmark::State& state) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + while(state.KeepRunning()) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_before); +BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu(); + + +void BM_spin_pause_during(benchmark::State& state) { + while(state.KeepRunning()) { + state.PauseTiming(); + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + 
state.ResumeTiming(); + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_during); +BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu(); + + +void BM_spin_pause_after(benchmark::State& state) { + while(state.KeepRunning()) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + } + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_after); +BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu(); + + +void BM_spin_pause_before_and_after(benchmark::State& state) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + while(state.KeepRunning()) { + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } + } + for (int i = 0; i < state.range_x(); ++i) { + volatile int dummy = i; + ((void)dummy); + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after); +BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu(); + + +void BM_empty_stop_start(benchmark::State& state) { + while (state.KeepRunning()) { } +} +BENCHMARK(BM_empty_stop_start); +BENCHMARK(BM_empty_stop_start)->ThreadPerCpu(); + +BENCHMARK_MAIN() diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc index 2ce1001d..d44ea319 100644 --- a/test/benchmark_test.cc +++ b/test/benchmark_test.cc @@ -53,18 +53,22 @@ static void BM_Factorial(benchmark::State& state) { while (state.KeepRunning()) fac_42 = Factorial(8); // Prevent compiler optimizations - std::cout << fac_42; + std::stringstream ss; + ss << fac_42; + state.SetLabel(ss.str()); } BENCHMARK(BM_Factorial); static void BM_FactorialRealTime(benchmark::State& state) { - benchmark::UseRealTime(); + state.UseRealTime(); int fac_42 = 0; while (state.KeepRunning()) fac_42 = Factorial(8); // Prevent compiler optimizations - std::cout << fac_42; + 
std::stringstream ss; + ss << fac_42; + state.SetLabel(ss.str()); } BENCHMARK(BM_FactorialRealTime); @@ -158,12 +162,5 @@ static void BM_LongTest(benchmark::State& state) { } BENCHMARK(BM_LongTest)->Range(1<<16,1<<28); -int main(int argc, const char* argv[]) { - benchmark::Initialize(&argc, argv); - - assert(Factorial(8) == 40320); - assert(CalculatePi(1) == 0.0); - - benchmark::RunSpecifiedBenchmarks(); -} +BENCHMARK_MAIN()