diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2526faf4..4296b235 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,10 @@ add_cxx_compiler_flag(-pedantic-errors)
 add_cxx_compiler_flag(-fno-strict-aliasing RELEASE)
 
 add_cxx_compiler_flag(-Wthread-safety)
+if (HAVE_WTHREAD_SAFETY)
+  add_definitions(-DHAVE_WTHREAD_SAFETY)
+  cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
+endif()
 
 # C++ feature checks
 cxx_feature_check(STD_REGEX)
diff --git a/cmake/thread_safety_attributes.cpp b/cmake/thread_safety_attributes.cpp
new file mode 100644
index 00000000..46161bab
--- /dev/null
+++ b/cmake/thread_safety_attributes.cpp
@@ -0,0 +1,4 @@
+#define HAVE_THREAD_SAFETY_ATTRIBUTES
+#include "../src/mutex.h"
+
+int main() {}
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 5da915ea..50f27f0d 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -135,7 +135,8 @@ BENCHMARK(BM_MultiThreaded)->Threads(4);
 #ifndef BENCHMARK_BENCHMARK_H_
 #define BENCHMARK_BENCHMARK_H_
 
-#include <stdint.h>
+#include <cassert>
+#include <cstdint>
 
 #include <functional>
 #include <memory>
@@ -153,10 +154,7 @@ void Initialize(int* argc, const char** argv);
 
 // Otherwise, run all benchmarks specified by the --benchmark_filter flag,
 // and exit after running the benchmarks.
-void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr);
-
-// ------------------------------------------------------
-// Routines that can be called from within a benchmark
+void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = NULL);
 
 // If this routine is called, peak memory allocation past this point in the
 // benchmark is reported at the end of the benchmark report line. (It is
@@ -165,14 +163,6 @@ void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter = nullptr);
 // TODO(dominic)
 // void MemoryUsage();
 
-// If a particular benchmark is I/O bound, or if for some reason CPU
-// timings are not representative, call this method from within the
-// benchmark routine.  If called, the elapsed time will be used to
-// control how many iterations are run, and in the printing of
-// items/second or MB/seconds values.  If not called, the cpu time
-// used by the benchmark will be used.
-void UseRealTime();
-
 namespace internal {
 class Benchmark;
 class BenchmarkFamilies;
@@ -181,13 +171,63 @@ class BenchmarkFamilies;
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
 class State {
- public:
-  // Returns true iff the benchmark should continue through another iteration.
-  bool KeepRunning();
+public:
+  State(size_t max_iters, bool has_x, int x, bool has_y, int y, int thread_i);
 
+  // Returns true iff the benchmark should continue through another iteration.
+  // NOTE: A benchmark may not return from the test until KeepRunning() has
+  // returned false.
+  bool KeepRunning() {
+    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
+        ResumeTiming();
+        started_ = true;
+    }
+    bool const res = total_iterations_++ < max_iterations;
+    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
+        assert(started_);
+        PauseTiming();
+        // Total iterations now is one greater than max iterations. Fix this.
+        total_iterations_ = max_iterations;
+    }
+    return res;
+  }
+
+  // REQUIRES: timer is running
+  // Stop the benchmark timer.  If not called, the timer will be
+  // automatically stopped after KeepRunning() returns false for the first time.
+  //
+  // For threaded benchmarks the PauseTiming() function acts
+  // like a barrier.  I.e., the ith call by a particular thread to this
+  // function will block until all threads have made their ith call.
+  // The timer will stop when the last thread has called this function.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
   void PauseTiming();
+
+  // REQUIRES: timer is not running
+  // Start the benchmark timer.  The timer is NOT running on entrance to the
+  // benchmark function. It begins running on the first call to KeepRunning().
+  //
+  // For threaded benchmarks the ResumeTiming() function acts
+  // like a barrier.  I.e., the ith call by a particular thread to this
+  // function will block until all threads have made their ith call.
+  // The timer will start when the last thread has called this function.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
   void ResumeTiming();
 
+  // If a particular benchmark is I/O bound, or if for some reason CPU
+  // timings are not representative, call this method from within the
+  // benchmark routine.  If called, the elapsed time will be used to
+  // control how many iterations are run, and in the printing of
+  // items/second or MB/seconds values.  If not called, the cpu time
+  // used by the benchmark will be used.
+  void UseRealTime();
+
   // Set the number of bytes processed by the current benchmark
   // execution.  This routine is typically called once at the end of a
   // throughput oriented benchmark.  If this routine is called with a
@@ -195,7 +235,15 @@ class State {
   // per iteration.
   //
   // REQUIRES: a benchmark has exited its KeepRunning loop.
-  void SetBytesProcessed(int64_t bytes);
+  BENCHMARK_ALWAYS_INLINE
+  void SetBytesProcessed(size_t bytes) {
+    bytes_processed_ = bytes;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t bytes_processed() const {
+    return bytes_processed_;
+  }
 
   // If this routine is called with items > 0, then an items/s
   // label is printed on the benchmark report line for the currently
@@ -203,94 +251,76 @@ class State {
   // benchmark where a processing items/second output is desired.
   //
   // REQUIRES: a benchmark has exited its KeepRunning loop.
-  void SetItemsProcessed(int64_t items);
+  BENCHMARK_ALWAYS_INLINE
+  void SetItemsProcessed(size_t items) {
+    items_processed_ = items;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  size_t items_processed() const {
+    return items_processed_;
+  }
 
   // If this routine is called, the specified label is printed at the
   // end of the benchmark report line for the currently executing
   // benchmark.  Example:
-  //  static void BM_Compress(benchmark::State& state) {
+  //  static void BM_Compress(benchmark::State& state) {
   //    ...
   //    double compress = input_size / output_size;
-  //    state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
+  //    state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compress));
   //  }
   // Produces output that looks like:
   //  BM_Compress   50         50   14115038  compress:27.3%
   //
   // REQUIRES: a benchmark has exited its KeepRunning loop.
-  void SetLabel(const std::string& label);
+  void SetLabel(const char* label);
+
+  // Allow the use of std::string without actually including <string>.
+  // This function does not participate in overload resolution unless StringType
+  // has the nested typename `basic_string`. This typename should be provided
+  // as an injected class name in the case of std::string.
+  template <class StringType>
+  void SetLabel(StringType const & str,
+                typename StringType::basic_string* = 0) {
+    this->SetLabel(str.c_str());
+  }
 
   // Range arguments for this run. CHECKs if the argument has been set.
-  int range_x() const;
-  int range_y() const;
+  BENCHMARK_ALWAYS_INLINE
+  int range_x() const {
+    assert(has_range_x_);
+    ((void)has_range_x_); // Prevent unused warning.
+    return range_x_;
+  }
 
-  int64_t iterations() const { return total_iterations_; }
+  BENCHMARK_ALWAYS_INLINE
+  int range_y() const {
+    assert(has_range_y_);
+    ((void)has_range_y_); // Prevent unused warning.
+    return range_y_;
+  }
 
+  BENCHMARK_ALWAYS_INLINE
+  size_t iterations() const { return total_iterations_; }
+
+private:
+  bool started_;
+  size_t total_iterations_;
+
+  bool has_range_x_;
+  int range_x_;
+
+  bool has_range_y_;
+  int range_y_;
+
+  size_t bytes_processed_;
+  size_t items_processed_;
+
+public:
   const int thread_index;
+  const size_t max_iterations;
 
- private:
-  class FastClock;
-  struct SharedState;
-  struct ThreadStats;
-
-  State(FastClock* clock, SharedState* s, int t);
-  bool StartRunning();
-  bool FinishInterval();
-  bool MaybeStop();
-  void NewInterval();
-  bool AllStarting();
-
-  static void* RunWrapper(void* arg);
-  void Run();
-  void RunAsThread();
-  void Wait();
-
-  enum EState {
-    STATE_INITIAL,   // KeepRunning hasn't been called
-    STATE_STARTING,  // KeepRunning called, waiting for other threads
-    STATE_RUNNING,   // Running and being timed
-    STATE_STOPPING,  // Not being timed but waiting for other threads
-    STATE_STOPPED    // Stopped
-  };
-
-  EState state_;
-
-  FastClock* clock_;
-
-  // State shared by all BenchmarkRun objects that belong to the same
-  // BenchmarkInstance
-  SharedState* shared_;
-
-  std::thread thread_;
-
-  // Custom label set by the user.
-  std::string label_;
-
-  // Each State object goes through a sequence of measurement intervals. By
-  // default each interval is approx. 100ms in length. The following stats are
-  // kept for each interval.
-  int64_t iterations_;
-  double start_cpu_;
-  double start_time_;
-  int64_t stop_time_micros_;
-
-  double start_pause_cpu_;
-  double pause_cpu_time_;
-  double start_pause_real_;
-  double pause_real_time_;
-
-  // Total number of iterations for all finished runs.
-  int64_t total_iterations_;
-
-  // Approximate time in microseconds for one interval of execution.
-  // Dynamically adjusted as needed.
-  int64_t interval_micros_;
-
-  // True if the current interval is the continuation of a previous one.
-  bool is_continuation_;
-
-  std::unique_ptr<ThreadStats> stats_;
-
-  friend class internal::Benchmark;
+private:
   BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
 };
 
@@ -304,7 +334,6 @@ class BenchmarkReporter {
   struct Context {
     int num_cpus;
     double mhz_per_cpu;
-    // std::string cpu_info;
     bool cpu_scaling_enabled;
 
     // The number of chars in the longest benchmark name.
@@ -312,19 +341,17 @@ class BenchmarkReporter {
   };
 
   struct Run {
-    Run()
-        : thread_index(-1),
-          iterations(1),
-          real_accumulated_time(0),
-          cpu_accumulated_time(0),
-          bytes_per_second(0),
-          items_per_second(0),
-          max_heapbytes_used(0) {}
+    Run() :
+      iterations(1),
+      real_accumulated_time(0),
+      cpu_accumulated_time(0),
+      bytes_per_second(0),
+      items_per_second(0),
+      max_heapbytes_used(0) {}
 
     std::string benchmark_name;
-    std::string report_label;
-    int thread_index;
-    int64_t iterations;
+    std::string report_label;  // Empty if not set by benchmark.
+    size_t iterations;
     double real_accumulated_time;
     double cpu_accumulated_time;
 
@@ -350,22 +377,12 @@ class BenchmarkReporter {
   // benchmark, thus have the same name.
   virtual void ReportRuns(const std::vector<Run>& report) const = 0;
 
-  virtual ~BenchmarkReporter() {}
+  virtual ~BenchmarkReporter();
 };
 
 namespace internal {
 
-typedef std::function<void(State&)> BenchmarkFunction;
-
-// Run all benchmarks whose name is a partial match for the regular
-// expression in "spec". The results of benchmark runs are fed to "reporter".
-void RunMatchingBenchmarks(const std::string& spec,
-                           const BenchmarkReporter* reporter);
-
-// Extract the list of benchmark names that match the specified regular
-// expression.
-void FindMatchingBenchmarkNames(const std::string& re,
-                                std::vector<std::string>* benchmark_names);
+typedef void(Function)(State&);
 
 // ------------------------------------------------------
 // Benchmark registration object.  The BENCHMARK() macro expands
@@ -375,8 +392,7 @@ void FindMatchingBenchmarkNames(const std::string& re,
 // chained into one expression.
 class Benchmark {
  public:
-  // The Benchmark takes ownership of the Callback pointed to by f.
-  Benchmark(const char* name, BenchmarkFunction f);
+  Benchmark(const char* name, Function* f);
 
   ~Benchmark();
 
@@ -444,40 +460,25 @@ class Benchmark {
   // Used inside the benchmark implementation
   struct Instance;
 
-  // Measure the overhead of an empty benchmark to subtract later.
-  static void MeasureOverhead();
-
  private:
-  friend class BenchmarkFamilies;
-
-  std::vector<Benchmark::Instance> CreateBenchmarkInstances(size_t rangeXindex,
-                                                            size_t rangeYindex);
-
   std::string name_;
-  BenchmarkFunction function_;
-  size_t registration_index_;
-  std::vector<int> rangeX_;
-  std::vector<int> rangeY_;
+  Function* function_;
+  std::size_t registration_index_;
+  int arg_count_;
+  std::vector< std::pair<int, int> > args_;  // Args for all benchmark runs
   std::vector<int> thread_counts_;
-  std::mutex mutex_;
 
   // Special value placed in thread_counts_ to stand for NumCPUs()
   static const int kNumCpuMarker = -1;
 
-  // Special value used to indicate that no range is required.
-  static const size_t kNoRangeIndex = std::numeric_limits<size_t>::max();
-  static const int kNoRange = std::numeric_limits<int>::max();
-
   static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
-  static double MeasurePeakHeapMemory(const Instance& b);
-  static void RunInstance(const Instance& b, const BenchmarkReporter* br);
-  friend class ::benchmark::State;
-  friend struct ::benchmark::internal::Benchmark::Instance;
-  friend void ::benchmark::internal::RunMatchingBenchmarks(
-      const std::string&, const BenchmarkReporter*);
+
+  friend class BenchmarkFamilies;
+
   BENCHMARK_DISALLOW_COPY_AND_ASSIGN(Benchmark);
 };
 
+
 // ------------------------------------------------------
 // Internal implementation details follow; please ignore
 
@@ -487,16 +488,16 @@ class ConsoleReporter : public BenchmarkReporter {
  public:
   virtual bool ReportContext(const Context& context) const;
   virtual void ReportRuns(const std::vector<Run>& reports) const;
-
  private:
-  std::string PrintMemoryUsage(double bytes) const;
   virtual void PrintRunData(const Run& report) const;
+  // TODO(ericwf): Find a better way to share this information.
   mutable size_t name_field_width_;
 };
 
 }  // end namespace internal
 }  // end namespace benchmark
 
+
 // ------------------------------------------------------
 // Macro to register benchmarks
 
@@ -534,4 +535,11 @@ class ConsoleReporter : public BenchmarkReporter {
       __benchmark_, n, __LINE__) BENCHMARK_UNUSED =          \
       (new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n<a, b>))
 
+// Helper macro to create a main routine in a test that runs the benchmarks
+#define BENCHMARK_MAIN()                             \
+  int main(int argc, const char** argv) {            \
+    ::benchmark::Initialize(&argc, argv);            \
+    ::benchmark::RunSpecifiedBenchmarks();           \
+  }
+
 #endif  // BENCHMARK_BENCHMARK_H_
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5f22510d..f3a825f2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,9 +2,8 @@
 include_directories(${PROJECT_SOURCE_DIR}/src)
 
 # Define the source files
-set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc"
-                 "log.cc" "sleep.cc" "string_util.cc" "sysinfo.cc"
-                 "walltime.cc")
+set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc" "log.cc"
+                 "sleep.cc" "string_util.cc" "sysinfo.cc" "walltime.cc")
 # Determine the correct regular expression engine to use
 if(HAVE_STD_REGEX)
   set(RE_FILES "re_std.cc")
diff --git a/src/benchmark.cc b/src/benchmark.cc
index d4f6f1b3..8b0682e6 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -13,30 +13,30 @@
 // limitations under the License.
 
 #include "benchmark/benchmark.h"
-#include "arraysize.h"
-#include "check.h"
-#include "colorprint.h"
-#include "commandlineflags.h"
-#include "internal_macros.h"
-#include "log.h"
-#include "re.h"
-#include "sleep.h"
-#include "stat.h"
-#include "string_util.h"
-#include "sysinfo.h"
-#include "walltime.h"
 
 #include <sys/time.h>
-#include <string.h>
+#include <sys/resource.h>
+#include <unistd.h>
 
+#include <cstdlib>
+#include <cstring>
 #include <algorithm>
 #include <atomic>
 #include <condition_variable>
 #include <iostream>
 #include <memory>
-#include <mutex>
 #include <thread>
-#include <sstream>
+
+#include "check.h"
+#include "commandlineflags.h"
+#include "colorprint.h"
+#include "log.h"
+#include "mutex.h"
+#include "re.h"
+#include "stat.h"
+#include "string_util.h"
+#include "sysinfo.h"
+#include "walltime.h"
 
 DEFINE_string(benchmark_filter, ".",
               "A regular expression that specifies the set of benchmarks "
@@ -57,124 +57,121 @@ DEFINE_double(benchmark_min_time, 0.5,
               "of the benchmark execution, regardless of number of "
               "threads.");
 
-DEFINE_bool(benchmark_memory_usage, false,
-            "Report memory usage for all benchmarks");
-
 DEFINE_int32(benchmark_repetitions, 1,
              "The number of runs of each benchmark. If greater than 1, the "
              "mean and standard deviation of the runs will be reported.");
 
-DEFINE_int32(v, 0, "The level of verbose logging to output");
 DEFINE_bool(color_print, true, "Enables colorized logging.");
 
-// Will be non-empty if heap checking is turned on, which would
-// invalidate any benchmarks.
-DECLARE_string(heap_check);
+DEFINE_int32(v, 0, "The level of verbose logging to output");
+
 
 // The ""'s catch people who don't pass in a literal for "str"
 #define strliterallen(str) (sizeof("" str "") - 1)
 
 // Must use a string literal for prefix.
-#define memprefix(str, len, prefix)                  \
-  ((((len) >= strliterallen(prefix)) &&              \
-    memcmp(str, prefix, strliterallen(prefix)) == 0) \
-       ? str + strliterallen(prefix)                 \
-       : NULL)
+#define memprefix(str, len, prefix)                       \
+  ((((len) >= strliterallen(prefix)) &&                   \
+    std::memcmp(str, prefix, strliterallen(prefix)) == 0) \
+       ? str + strliterallen(prefix)                      \
+       : nullptr)
+
 
 namespace benchmark {
+
+namespace internal {
+
+// NOTE: This is a dummy "mutex" type used to denote the actual mutex
+// returned by GetBenchmarkLock(). This is only used to placate the thread
+// safety warnings by giving the return of GetBenchmarkLock() a name.
+struct CAPABILITY("mutex") BenchmarkLockType {};
+BenchmarkLockType BenchmarkLockVar;
+
+} // end namespace internal
+
+inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
+GetBenchmarkLock()
+{
+  static Mutex lock;
+  return lock;
+}
+
 namespace {
+
 // For non-dense Range, intermediate values are powers of kRangeMultiplier.
 static const int kRangeMultiplier = 8;
-
-std::mutex starting_mutex;
-std::condition_variable starting_cv;
+static const int kMaxIterations = 1000000000;
 
 bool running_benchmark = false;
 
-// Should this benchmark report memory usage?
-bool get_memory_usage;
+// Global variable so that a benchmark can cause a little extra printing
+std::string* GetReportLabel() {
+    static std::string label GUARDED_BY(GetBenchmarkLock());
+    return &label;
+}
 
 // Should this benchmark base decisions off of real time rather than
 // cpu time?
-bool use_real_time;
+bool use_real_time GUARDED_BY(GetBenchmarkLock());
 
-// Overhead of an empty benchmark.
-double overhead = 0.0;
+// TODO(ericwf): support MallocCounter.
+//static benchmark::MallocCounter *benchmark_mc;
 
-// Return prefix to print in front of each reported line
-const char* Prefix() {
-#ifdef NDEBUG
-  return "";
-#else
-  return "DEBUG: ";
-#endif
-}
-
-// TODO
-// static internal::MallocCounter *benchmark_mc;
-
-bool CpuScalingEnabled() {
+static bool CpuScalingEnabled() {
   // On Linux, the CPUfreq subsystem exposes CPU information as files on the
   // local file system. If reading the exported files fails, then we may not be
   // running on Linux, so we silently ignore all the read errors.
   for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
-    std::stringstream ss;
-    ss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor";
-    std::string governor_file = ss.str();
+    std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu,
+                                       "/cpufreq/scaling_governor");
     FILE* file = fopen(governor_file.c_str(), "r");
     if (!file) break;
     char buff[16];
     size_t bytes_read = fread(buff, 1, sizeof(buff), file);
     fclose(file);
-    if (memprefix(buff, bytes_read, "performance") == NULL) return true;
+    if (memprefix(buff, bytes_read, "performance") == nullptr) return true;
   }
   return false;
 }
 
-// Given a collection of reports, computes their mean and stddev.
-// REQUIRES: all runs in "reports" must be from the same benchmark.
 void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports,
                   BenchmarkReporter::Run* mean_data,
                   BenchmarkReporter::Run* stddev_data) {
+  CHECK(reports.size() >= 2) << "Cannot compute stats for less than 2 reports";
   // Accumulators.
   Stat1_d real_accumulated_time_stat;
   Stat1_d cpu_accumulated_time_stat;
-  Stat1_d items_per_second_stat;
   Stat1_d bytes_per_second_stat;
-  Stat1_d iterations_stat;
-  Stat1MinMax_d max_heapbytes_used_stat;
+  Stat1_d items_per_second_stat;
+  // All repetitions should be run with the same number of iterations so we
+  // can take this information from the first benchmark.
+  std::size_t const run_iterations = reports.front().iterations;
 
   // Populate the accumulators.
-  for (std::vector<BenchmarkReporter::Run>::const_iterator it = reports.begin();
-       it != reports.end(); ++it) {
-    CHECK_EQ(reports[0].benchmark_name, it->benchmark_name);
+  for (BenchmarkReporter::Run const& run : reports) {
+    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+    CHECK_EQ(run_iterations, run.iterations);
     real_accumulated_time_stat +=
-        Stat1_d(it->real_accumulated_time / it->iterations, it->iterations);
+        Stat1_d(run.real_accumulated_time/run.iterations, run.iterations);
     cpu_accumulated_time_stat +=
-        Stat1_d(it->cpu_accumulated_time / it->iterations, it->iterations);
-    items_per_second_stat += Stat1_d(it->items_per_second, it->iterations);
-    bytes_per_second_stat += Stat1_d(it->bytes_per_second, it->iterations);
-    iterations_stat += Stat1_d(it->iterations, it->iterations);
-    max_heapbytes_used_stat +=
-        Stat1MinMax_d(it->max_heapbytes_used, it->iterations);
+        Stat1_d(run.cpu_accumulated_time/run.iterations, run.iterations);
+    items_per_second_stat += Stat1_d(run.items_per_second, run.iterations);
+    bytes_per_second_stat += Stat1_d(run.bytes_per_second, run.iterations);
   }
 
-  // Get the data from the accumulator to BenchmarkRunData's.  In the
-  // computations below we must multiply by the number of iterations since
-  // PrintRunData will divide by it.
+  // Get the data from the accumulator to BenchmarkReporter::Run's.
   mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
-  mean_data->iterations = iterations_stat.Mean();
+  mean_data->iterations = run_iterations;
   mean_data->real_accumulated_time = real_accumulated_time_stat.Mean() *
-                                     mean_data->iterations;
+                                     run_iterations;
   mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Mean() *
-                                    mean_data->iterations;
+                                    run_iterations;
   mean_data->bytes_per_second = bytes_per_second_stat.Mean();
   mean_data->items_per_second = items_per_second_stat.Mean();
-  mean_data->max_heapbytes_used = max_heapbytes_used_stat.Max();
 
   // Only add label to mean/stddev if it is same for all runs
   mean_data->report_label = reports[0].report_label;
-  for (size_t i = 1; i < reports.size(); i++) {
+  for (std::size_t i = 1; i < reports.size(); i++) {
     if (reports[i].report_label != reports[0].report_label) {
       mean_data->report_label = "";
       break;
@@ -183,29 +180,166 @@ void ComputeStats(const std::vector<BenchmarkReporter::Run>& reports,
 
   stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
   stddev_data->report_label = mean_data->report_label;
-  stddev_data->iterations = iterations_stat.StdDev();
-  // The value of iterations_stat.StdDev() above may be 0 if all the repetitions
-  // have the same number of iterations.  Blindly multiplying by 0 in the
-  // computation of real/cpu_accumulated_time below would lead to 0/0 in
-  // PrintRunData.  So we skip the multiplication in this case and PrintRunData
-  // skips the division.
-  if (stddev_data->iterations == 0) {
-    stddev_data->real_accumulated_time = real_accumulated_time_stat.StdDev();
-    stddev_data->cpu_accumulated_time = cpu_accumulated_time_stat.StdDev();
-  } else {
-    stddev_data->real_accumulated_time = real_accumulated_time_stat.StdDev() *
-                                         stddev_data->iterations;
-    stddev_data->cpu_accumulated_time = cpu_accumulated_time_stat.StdDev() *
-                                        stddev_data->iterations;
-  }
+  stddev_data->iterations = 0;
+  stddev_data->real_accumulated_time =
+      real_accumulated_time_stat.StdDev();
+  stddev_data->cpu_accumulated_time =
+      cpu_accumulated_time_stat.StdDev();
   stddev_data->bytes_per_second = bytes_per_second_stat.StdDev();
   stddev_data->items_per_second = items_per_second_stat.StdDev();
-  stddev_data->max_heapbytes_used = max_heapbytes_used_stat.StdDev();
 }
-}  // namespace
+
+struct ThreadStats {
+    ThreadStats() : bytes_processed(0), items_processed(0) {}
+    int64_t bytes_processed;
+    int64_t items_processed;
+};
+
+// Timer management class
+class TimerManager {
+ public:
+  TimerManager(int num_threads, Notification* done)
+      : num_threads_(num_threads),
+        done_(done),
+        running_(false),
+        real_time_used_(0),
+        cpu_time_used_(0),
+        num_finalized_(0),
+        phase_number_(0),
+        entered_(0) {
+  }
+
+  // Called by each thread; the last thread to arrive starts the timer.
+  void StartTimer() EXCLUDES(lock_) {
+    bool last_thread = false;
+    {
+      MutexLock ml(lock_);
+      last_thread = Barrier(ml);  // true only for the last thread to arrive
+      if (last_thread) {
+        CHECK(!running_) << "Called StartTimer when timer is already running";
+        running_ = true;
+        start_real_time_ = walltime::Now();
+        start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
+      }
+    }
+    if (last_thread) {
+      phase_condition_.notify_all();  // Release threads waiting in Barrier().
+    }
+  }
+
+  // Called by each thread
+  void StopTimer() EXCLUDES(lock_) {
+    bool last_thread = false;
+    {
+      MutexLock ml(lock_);
+      last_thread = Barrier(ml);
+      if (last_thread) {
+        CHECK(running_) << "Called StopTimer when timer is already stopped";
+        InternalStop();
+      }
+    }
+    if (last_thread) {
+      phase_condition_.notify_all();
+    }
+  }
+
+  // Called by each thread
+  void Finalize() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    num_finalized_++;
+    if (num_finalized_ == num_threads_) {
+      CHECK(!running_) <<
+        "The timer should be stopped before the timer is finalized";
+      done_->Notify();
+    }
+  }
+
+  // REQUIRES: timer is not running
+  double real_time_used() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    CHECK(!running_);
+    return real_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double cpu_time_used() EXCLUDES(lock_) {
+    MutexLock l(lock_);
+    CHECK(!running_);
+    return cpu_time_used_;
+  }
+
+ private:
+  Mutex lock_;
+  Condition phase_condition_;
+  int num_threads_;
+  Notification* done_;
+
+  bool running_;                // Is the timer running
+  double start_real_time_;      // If running_
+  double start_cpu_time_;       // If running_
+
+  // Accumulated time so far (does not contain current slice if running_)
+  double real_time_used_;
+  double cpu_time_used_;
+
+  // How many threads have called Finalize()
+  int num_finalized_;
+
+  // State for barrier management
+  int phase_number_;
+  int entered_;         // Number of threads that have entered this barrier
+
+  void InternalStop() REQUIRES(lock_) {
+    CHECK(running_);
+    running_ = false;
+    real_time_used_ += walltime::Now() - start_real_time_;
+    cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage())
+                       - start_cpu_time_);
+  }
+
+  // Enter the barrier and wait until all other threads have also
+  // entered the barrier.  Returns true iff this is the last thread to
+  // enter the barrier.
+  bool Barrier(MutexLock& ml) REQUIRES(lock_) {
+    CHECK_LT(entered_, num_threads_);
+    entered_++;
+    if (entered_ < num_threads_) {
+      // Wait for all threads to enter
+      int phase_number_cp = phase_number_;
+      auto cb = [this, phase_number_cp]() {
+        return this->phase_number_ > phase_number_cp;
+      };
+      phase_condition_.wait(ml.native_handle(), cb);
+      return false;  // I was not the last one
+    } else {
+      // Last thread has reached the barrier
+      phase_number_++;
+      entered_ = 0;
+      return true;
+    }
+  }
+};
+
+// TimerManager for current run.
+static std::unique_ptr<TimerManager> timer_manager = nullptr;
+
+} // end namespace
 
 namespace internal {
 
+// Information kept per benchmark we may want to run
+struct Benchmark::Instance {
+  std::string   name;
+  Function*     function;
+  bool          has_arg1;
+  int           arg1;
+  bool          has_arg2;
+  int           arg2;
+  int           threads;    // Number of concurrent threads to use
+  bool          multithreaded;  // Is benchmark multi-threaded?
+};
+
+
 // Class for managing registered benchmarks.  Note that each registered
 // benchmark identifies a family of related benchmarks to run.
 class BenchmarkFamilies {
@@ -220,16 +354,17 @@ class BenchmarkFamilies {
 
   // Extract the list of benchmark instances that match the specified
   // regular expression.
-  void FindBenchmarks(const std::string& re,
+  bool FindBenchmarks(const std::string& re,
                       std::vector<Benchmark::Instance>* benchmarks);
  private:
   BenchmarkFamilies();
   ~BenchmarkFamilies();
 
   std::vector<Benchmark*> families_;
-  std::mutex mutex_;
+  Mutex mutex_;
 };
 
+
 BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
   static BenchmarkFamilies instance;
   return &instance;
@@ -244,7 +379,7 @@ BenchmarkFamilies::~BenchmarkFamilies() {
 }
 
 size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) {
-  std::lock_guard<std::mutex> l(mutex_);
+  MutexLock l(mutex_);
   // This loop attempts to reuse an entry that was previously removed to avoid
   // unncessary growth of the vector.
   for (size_t index = 0; index < families_.size(); ++index) {
@@ -259,392 +394,133 @@ size_t BenchmarkFamilies::AddBenchmark(Benchmark* family) {
 }
 
 void BenchmarkFamilies::RemoveBenchmark(size_t index) {
-  std::lock_guard<std::mutex> l(mutex_);
-  families_[index] = NULL;
+  MutexLock l(mutex_);
+  families_[index] = nullptr;
   // Don't shrink families_ here, we might be called by the destructor of
   // BenchmarkFamilies which iterates over the vector.
 }
 
-void BenchmarkFamilies::FindBenchmarks(
+bool BenchmarkFamilies::FindBenchmarks(
     const std::string& spec,
     std::vector<Benchmark::Instance>* benchmarks) {
   // Make regular expression out of command-line flag
+  std::string error_msg;
   Regex re;
-  std::string re_error;
-  if (!re.Init(spec, &re_error)) {
-    std::cerr << "Could not compile benchmark re: " << re_error << std::endl;
-    return;
+  if (!re.Init(spec, &error_msg)) {
+    std::cerr << "Could not compile benchmark re: " << error_msg << std::endl;
+    return false;
   }
 
-  std::lock_guard<std::mutex> l(mutex_);
-  for (internal::Benchmark* family : families_) {
-    if (family == nullptr) continue;  // Family was deleted
+  // Special list of thread counts to use when none are specified
+  std::vector<int> one_thread;
+  one_thread.push_back(1);
 
-    // Match against filter.
-    if (!re.Match(family->name_)) {
-      VLOG(1) << "Skipping " << family->name_ << "\n";
-      continue;
+  MutexLock l(mutex_);
+  for (Benchmark* family : families_) {
+    // Family was deleted or benchmark doesn't match
+    if (family == nullptr || !re.Match(family->name_)) continue;
+
+    if (family->arg_count_ == -1) {
+      family->arg_count_ = 0;
+      family->args_.emplace_back(-1, -1);
     }
+    for (auto const& args : family->args_) {
+      const std::vector<int>* thread_counts =
+        (family->thread_counts_.empty()
+         ? &one_thread
+         : &family->thread_counts_);
+      for (int num_threads : *thread_counts) {
 
-    std::vector<Benchmark::Instance> instances;
-    if (family->rangeX_.empty() && family->rangeY_.empty()) {
-      instances = family->CreateBenchmarkInstances(
-        Benchmark::kNoRangeIndex, Benchmark::kNoRangeIndex);
-      std::copy(instances.begin(), instances.end(),
-                std::back_inserter(*benchmarks));
-    } else if (family->rangeY_.empty()) {
-      for (size_t x = 0; x < family->rangeX_.size(); ++x) {
-        instances = family->CreateBenchmarkInstances(
-          x, Benchmark::kNoRangeIndex);
-        std::copy(instances.begin(), instances.end(),
-                  std::back_inserter(*benchmarks));
-      }
-    } else {
-      for (size_t x = 0; x < family->rangeX_.size(); ++x) {
-        for (size_t y = 0; y < family->rangeY_.size(); ++y) {
-          instances = family->CreateBenchmarkInstances(x, y);
-          std::copy(instances.begin(), instances.end(),
-                    std::back_inserter(*benchmarks));
+        Benchmark::Instance instance;
+        instance.name = family->name_;
+        instance.function = family->function_;
+        instance.has_arg1 = family->arg_count_ >= 1;
+        instance.arg1 = args.first;
+        instance.has_arg2 = family->arg_count_ == 2;
+        instance.arg2 = args.second;
+        instance.threads = num_threads;
+        instance.multithreaded = !(family->thread_counts_.empty());
+
+        // Add arguments to instance name
+        if (family->arg_count_ >= 1) {
+          AppendHumanReadable(instance.arg1, &instance.name);
         }
+        if (family->arg_count_ >= 2) {
+          AppendHumanReadable(instance.arg2, &instance.name);
+        }
+
+        // Add the number of threads used to the name
+        if (!family->thread_counts_.empty()) {
+          instance.name += StringPrintF("/threads:%d", instance.threads);
+        }
+
+        benchmarks->push_back(instance);
       }
     }
   }
-}
-
-std::string ConsoleReporter::PrintMemoryUsage(double bytes) const {
-  if (!get_memory_usage || bytes < 0.0) return "";
-
-  std::stringstream ss;
-  ss << " " << HumanReadableNumber(bytes) << "B peak-mem";
-  return ss.str();
-}
-
-bool ConsoleReporter::ReportContext(const BenchmarkReporter::Context& context)
-    const {
-  name_field_width_ = context.name_field_width;
-
-  std::cout << "Benchmarking on " << context.num_cpus << " X "
-            << context.mhz_per_cpu << " MHz CPU"
-            << ((context.num_cpus > 1) ? "s" : "") << "\n";
-
-  int remainder_ms;
-  std::cout << walltime::Print(walltime::Now(), "%Y/%m/%d-%H:%M:%S",
-                               true,  // use local timezone
-                               &remainder_ms) << "\n";
-
-  // Show details of CPU model, caches, TLBs etc.
-  //  if (!context.cpu_info.empty())
-  //    std::cout << "CPU: " << context.cpu_info.c_str();
-
-  if (context.cpu_scaling_enabled) {
-    std::cerr << "CPU scaling is enabled: Benchmark timings may be noisy.\n";
-  }
-
-  int output_width = fprintf(stdout, "%s%-*s %10s %10s %10s\n",
-                             Prefix(), int(name_field_width_), "Benchmark",
-                             "Time(ns)", "CPU(ns)", "Iterations");
-  std::cout << std::string(output_width - 1, '-').c_str() << "\n";
-
   return true;
 }
 
-void ConsoleReporter::ReportRuns(
-    const std::vector<BenchmarkReporter::Run>& reports) const {
-  for (std::vector<BenchmarkReporter::Run>::const_iterator it = reports.begin();
-       it != reports.end(); ++it) {
-    CHECK_EQ(reports[0].benchmark_name, it->benchmark_name);
-    PrintRunData(*it);
-  }
 
-  // We don't report aggregated data if there was a single run.
-  if (reports.size() < 2) return;
-
-  BenchmarkReporter::Run mean_data;
-  BenchmarkReporter::Run stddev_data;
-  ComputeStats(reports, &mean_data, &stddev_data);
-
-  PrintRunData(mean_data);
-  PrintRunData(stddev_data);
-}
-
-void ConsoleReporter::PrintRunData(const BenchmarkReporter::Run& result) const {
-  // Format bytes per second
-  std::string rate;
-  if (result.bytes_per_second > 0) {
-    std::stringstream ss;
-    ss << " " << HumanReadableNumber(result.bytes_per_second) << "B/s";
-    rate = ss.str();
-  }
-
-  // Format items per second
-  std::string items;
-  if (result.items_per_second > 0) {
-    std::stringstream ss;
-    ss << " " << HumanReadableNumber(result.items_per_second) << " items/s";
-    items = ss.str();
-  }
-
-  ColorPrintf(COLOR_DEFAULT, "%s", Prefix());
-  ColorPrintf(COLOR_GREEN, "%-*s ",
-              name_field_width_, result.benchmark_name.c_str());
-  if (result.iterations == 0) {
-    ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
-                result.real_accumulated_time * 1e9,
-                result.cpu_accumulated_time * 1e9);
-  } else {
-    ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
-                (result.real_accumulated_time * 1e9) /
-                    (static_cast<double>(result.iterations)),
-                (result.cpu_accumulated_time * 1e9) /
-                    (static_cast<double>(result.iterations)));
-  }
-  ColorPrintf(COLOR_CYAN, "%10lld", result.iterations);
-  ColorPrintf(COLOR_DEFAULT, "%*s %*s %s %s\n",
-              13, rate.c_str(),
-              18, items.c_str(),
-              result.report_label.c_str(),
-              PrintMemoryUsage(result.max_heapbytes_used).c_str());
-}
-
-/* TODO(dominic)
-void MemoryUsage() {
-  // if (benchmark_mc) {
-  //  benchmark_mc->Reset();
-  //} else {
-  get_memory_usage = true;
-  //}
-}
-*/
-
-void PrintUsageAndExit() {
-  fprintf(stdout,
-          "benchmark [--benchmark_filter=<regex>]\n"
-          "          [--benchmark_iterations=<iterations>]\n"
-          "          [--benchmark_min_time=<min_time>]\n"
-          //"          [--benchmark_memory_usage]\n"
-          "          [--benchmark_repetitions=<num_repetitions>]\n"
-          "          [--color_print={true|false}]\n"
-          "          [--v=<verbosity>]\n");
-  exit(0);
-}
-
-void ParseCommandLineFlags(int* argc, const char** argv) {
-  for (int i = 1; i < *argc; ++i) {
-    if (ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
-        ParseInt32Flag(argv[i], "benchmark_iterations",
-                       &FLAGS_benchmark_iterations) ||
-        ParseDoubleFlag(argv[i], "benchmark_min_time",
-                        &FLAGS_benchmark_min_time) ||
-        // TODO(dominic)
-        //        ParseBoolFlag(argv[i], "gbenchmark_memory_usage",
-        //                      &FLAGS_gbenchmark_memory_usage) ||
-        ParseInt32Flag(argv[i], "benchmark_repetitions",
-                       &FLAGS_benchmark_repetitions) ||
-        ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) ||
-        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
-      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
-
-      --(*argc);
-      --i;
-    } else if (IsFlag(argv[i], "help"))
-      PrintUsageAndExit();
-  }
-}
-
-}  // end namespace internal
-
-// A clock that provides a fast mechanism to check if we're nearly done.
-class State::FastClock {
- public:
-  enum Type {
-    REAL_TIME,
-    CPU_TIME
-  };
-  explicit FastClock(Type type)
-      : type_(type),
-        approx_time_(NowMicros()),
-        bg_done_(false),
-        bg_(BGThreadWrapper, this) { }
-
-  ~FastClock() {
-    {
-      std::unique_lock<std::mutex> l(bg_mutex_);
-      bg_done_ = true;
-      bg_cond_.notify_one();
-    }
-    bg_.join();
-  }
-
-  // Returns true if the current time is guaranteed to be past "when_micros".
-  // This method is very fast.
-  inline bool HasReached(int64_t when_micros) {
-    return std::atomic_load(&approx_time_) >= when_micros;
-  }
-
-  // Returns the current time in microseconds past the epoch.
-  int64_t NowMicros() const {
-    double t = 0;
-    switch (type_) {
-      case REAL_TIME:
-        t = walltime::Now();
-        break;
-      case CPU_TIME:
-        t = MyCPUUsage() + ChildrenCPUUsage();
-        break;
-    }
-    return static_cast<int64_t>(t * kNumMicrosPerSecond);
-  }
-
-  // Reinitialize if necessary (since clock type may be change once benchmark
-  // function starts running - see UseRealTime).
-  void InitType(Type type) {
-    type_ = type;
-    std::lock_guard<std::mutex> l(bg_mutex_);
-    std::atomic_store(&approx_time_, NowMicros());
-  }
-
- private:
-  Type type_;
-  std::atomic<int64_t> approx_time_;  // Last time measurement taken by bg_
-  bool bg_done_;  // This is used to signal background thread to exit
-  std::mutex bg_mutex_;
-  std::condition_variable bg_cond_;
-  std::thread bg_;  // Background thread that updates last_time_ once every ms
-
-  static void* BGThreadWrapper(void* that) {
-    ((FastClock*)that)->BGThread();
-    return NULL;
-  }
-
-  void BGThread() {
-    std::unique_lock<std::mutex> l(bg_mutex_);
-    while (!bg_done_)
-    {
-      // Set timeout to 1 ms.
-      bg_cond_.wait_for(l, std::chrono::milliseconds(1));
-      std::atomic_store(&approx_time_, NowMicros());
-    }
-  }
-
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(FastClock);
-};
-
-struct State::ThreadStats {
-  int64_t bytes_processed;
-  int64_t items_processed;
-
-  ThreadStats() { Reset(); }
-
-  void Reset() {
-    bytes_processed = 0;
-    items_processed = 0;
-  }
-
-  void Add(const ThreadStats& other) {
-    bytes_processed += other.bytes_processed;
-    items_processed += other.items_processed;
-  }
-};
-
-namespace internal {
-
-// Information kept per benchmark we may want to run
-struct Benchmark::Instance {
-  Instance()
-      : bm(nullptr),
-        threads(1),
-        rangeXset(false),
-        rangeX(kNoRange),
-        rangeYset(false),
-        rangeY(kNoRange) {}
-
-  std::string name;
-  Benchmark* bm;
-  int threads;  // Number of concurrent threads to use
-
-  bool rangeXset;
-  int rangeX;
-  bool rangeYset;
-  int rangeY;
-
-  bool multithreaded() const { return !bm->thread_counts_.empty(); }
-};
-
-}  // end namespace internal
-
-struct State::SharedState {
-  const internal::Benchmark::Instance* instance;
-  std::mutex mu;
-  std::condition_variable cond;
-  int starting;  // Number of threads that have entered STARTING state
-  int stopping;  // Number of threads that have entered STOPPING state
-  int exited;    // Number of threads that have complete exited
-  int threads;   // Number of total threads that are running concurrently
-  ThreadStats stats;
-  std::vector<BenchmarkReporter::Run> runs;  // accumulated runs
-  std::string label;
-
-  explicit SharedState(const internal::Benchmark::Instance* b)
-      : instance(b),
-        starting(0),
-        stopping(0),
-        exited(0),
-        threads(b == nullptr ? 1 : b->threads) { }
-
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SharedState);
-};
-
-namespace internal {
-
-Benchmark::Benchmark(const char* name, BenchmarkFunction f)
-    : name_(name), function_(f) {
+Benchmark::Benchmark(const char* name,
+                     Function* f)
+                    : name_(name), function_(f), arg_count_(-1) {
   registration_index_ = BenchmarkFamilies::GetInstance()->AddBenchmark(this);
 }
 
-Benchmark::~Benchmark() {
+Benchmark::~Benchmark()  {
   BenchmarkFamilies::GetInstance()->RemoveBenchmark(registration_index_);
 }
 
 Benchmark* Benchmark::Arg(int x) {
-  std::lock_guard<std::mutex> l(mutex_);
-  rangeX_.push_back(x);
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
+  args_.emplace_back(x, -1);
   return this;
 }
 
 Benchmark* Benchmark::Range(int start, int limit) {
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
   std::vector<int> arglist;
   AddRange(&arglist, start, limit, kRangeMultiplier);
 
-  std::lock_guard<std::mutex> l(mutex_);
-  for (size_t i = 0; i < arglist.size(); ++i) rangeX_.push_back(arglist[i]);
+  for (int i : arglist) {
+    args_.emplace_back(i, -1);
+  }
   return this;
 }
 
 Benchmark* Benchmark::DenseRange(int start, int limit) {
+  CHECK(arg_count_ == -1 || arg_count_ == 1);
+  arg_count_ = 1;
   CHECK_GE(start, 0);
   CHECK_LE(start, limit);
-  std::lock_guard<std::mutex> l(mutex_);
-  for (int arg = start; arg <= limit; ++arg) rangeX_.push_back(arg);
+  for (int arg = start; arg <= limit; arg++) {
+    args_.emplace_back(arg, -1);
+  }
   return this;
 }
 
 Benchmark* Benchmark::ArgPair(int x, int y) {
-  std::lock_guard<std::mutex> l(mutex_);
-  rangeX_.push_back(x);
-  rangeY_.push_back(y);
+  CHECK(arg_count_ == -1 || arg_count_ == 2);
+  arg_count_ = 2;
+  args_.emplace_back(x, y);
   return this;
 }
 
 Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
+  CHECK(arg_count_ == -1 || arg_count_ == 2);
+  arg_count_ = 2;
   std::vector<int> arglist1, arglist2;
   AddRange(&arglist1, lo1, hi1, kRangeMultiplier);
   AddRange(&arglist2, lo2, hi2, kRangeMultiplier);
 
-  std::lock_guard<std::mutex> l(mutex_);
-  rangeX_.resize(arglist1.size());
-  std::copy(arglist1.begin(), arglist1.end(), rangeX_.begin());
-  rangeY_.resize(arglist2.size());
-  std::copy(arglist2.begin(), arglist2.end(), rangeY_.begin());
+  for (int i : arglist1) {
+    for (int j : arglist2) {
+      args_.emplace_back(i, j);
+    }
+  }
   return this;
 }
 
@@ -655,7 +531,6 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
 
 Benchmark* Benchmark::Threads(int t) {
   CHECK_GT(t, 0);
-  std::lock_guard<std::mutex> l(mutex_);
   thread_counts_.push_back(t);
   return this;
 }
@@ -664,14 +539,13 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
   CHECK_GT(min_threads, 0);
   CHECK_GE(max_threads, min_threads);
 
-  std::lock_guard<std::mutex> l(mutex_);
   AddRange(&thread_counts_, min_threads, max_threads, 2);
   return this;
 }
 
 Benchmark* Benchmark::ThreadPerCpu() {
-  std::lock_guard<std::mutex> l(mutex_);
-  thread_counts_.push_back(NumCPUs());
+  static int num_cpus = NumCPUs();
+  thread_counts_.push_back(num_cpus);
   return this;
 }
 
@@ -682,443 +556,310 @@ void Benchmark::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
   // Add "lo"
   dst->push_back(lo);
 
+  static const int kint32max = std::numeric_limits<int32_t>::max();
+
   // Now space out the benchmarks in multiples of "mult"
-  for (int32_t i = 1; i < std::numeric_limits<int32_t>::max() / mult;
-       i *= mult) {
+  for (int32_t i = 1; i < kint32max/mult; i *= mult) {
     if (i >= hi) break;
-    if (i > lo) dst->push_back(i);
+    if (i > lo) {
+      dst->push_back(i);
+    }
   }
   // Add "hi" (if different from "lo")
-  if (hi != lo) dst->push_back(hi);
-}
-
-std::vector<Benchmark::Instance> Benchmark::CreateBenchmarkInstances(
-    size_t rangeXindex, size_t rangeYindex) {
-  // Special list of thread counts to use when none are specified
-  std::vector<int> one_thread;
-  one_thread.push_back(1);
-
-  std::vector<Benchmark::Instance> instances;
-
-  const bool is_multithreaded = (!thread_counts_.empty());
-  const std::vector<int>& thread_counts =
-      (is_multithreaded ? thread_counts_ : one_thread);
-  for (int num_threads : thread_counts) {
-    Instance instance;
-    instance.name = name_;
-    instance.bm = this;
-    instance.threads = num_threads;
-
-    if (rangeXindex != kNoRangeIndex) {
-      instance.rangeX = rangeX_[rangeXindex];
-      instance.rangeXset = true;
-      AppendHumanReadable(instance.rangeX, &instance.name);
-    }
-    if (rangeYindex != kNoRangeIndex) {
-      instance.rangeY = rangeY_[rangeYindex];
-      instance.rangeYset = true;
-      AppendHumanReadable(instance.rangeY, &instance.name);
-    }
-
-    // Add the number of threads used to the name
-    if (is_multithreaded) {
-      std::stringstream ss;
-      ss << "/threads:" << instance.threads;
-      instance.name += ss.str();
-    }
-
-    instances.push_back(instance);
+  if (hi != lo) {
+    dst->push_back(hi);
   }
-
-  return instances;
 }
 
-void Benchmark::MeasureOverhead() {
-  State::FastClock clock(State::FastClock::CPU_TIME);
-  State::SharedState state(nullptr);
-  State runner(&clock, &state, 0);
-  while (runner.KeepRunning()) {
-  }
-  overhead = state.runs[0].real_accumulated_time /
-             static_cast<double>(state.runs[0].iterations);
-  VLOG(1) << "Per-iteration overhead for doing nothing: " << overhead << "\n";
-}
+} // end namespace internal
 
-void Benchmark::RunInstance(const Instance& b, const BenchmarkReporter* br) {
-  use_real_time = false;
-  running_benchmark = true;
-  // get_memory_usage = FLAGS_gbenchmark_memory_usage;
-  State::FastClock clock(State::FastClock::CPU_TIME);
+namespace {
 
-  // Initialize the test runners.
-  State::SharedState state(&b);
+
+// Execute one thread of benchmark b for the specified number of iterations.
+// Adds the stats collected for the thread into *total.
+void RunInThread(const benchmark::internal::Benchmark::Instance* b,
+                 int iters, int thread_id,
+                 ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
+  State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id);
+  b->function(st);
+  CHECK(st.iterations() == st.max_iterations) <<
+    "Benchmark returned before State::KeepRunning() returned false!";
   {
-    std::vector<std::unique_ptr<State>> runners;
-    for (int i = 0; i < b.threads; ++i)
-      runners.push_back(std::unique_ptr<State>(new State(&clock, &state, i)));
-
-    // Run them all.
-    for (int i = 0; i < b.threads; ++i) {
-      if (b.multithreaded())
-        runners[i]->RunAsThread();
-      else
-        runners[i]->Run();
-    }
-    if (b.multithreaded()) {
-      for (int i = 0; i < b.threads; ++i) runners[i]->Wait();
-    }
+    MutexLock l(GetBenchmarkLock());
+    total->bytes_processed += st.bytes_processed();
+    total->items_processed += st.items_processed();
   }
-  /*
-    double mem_usage = 0;
-    if (get_memory_usage) {
-      // Measure memory usage
-      Notification mem_done;
-      BenchmarkRun mem_run;
-      BenchmarkRun::SharedState mem_shared(&b, 1);
-      mem_run.Init(&clock, &mem_shared, 0);
+
+  timer_manager->Finalize();
+}
+
+void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
+                  const BenchmarkReporter* br) EXCLUDES(GetBenchmarkLock()) {
+  int iters = FLAGS_benchmark_iterations ? FLAGS_benchmark_iterations
+                                         : 1;
+  std::vector<BenchmarkReporter::Run> reports;
+
+  std::vector<std::thread> pool;
+  if (b.multithreaded)
+    pool.resize(b.threads);
+
+  for (int i = 0; i < FLAGS_benchmark_repetitions; i++) {
+    std::string mem;
+    while (true) {
+      // Try benchmark
+      VLOG(2) << "Running " << b.name << " for " << iters << "\n";
+
       {
-        testing::MallocCounter mc(testing::MallocCounter::THIS_THREAD_ONLY);
-        benchmark_mc = &mc;
-        mem_run.Run(&mem_done);
-        mem_done.WaitForNotification();
-        benchmark_mc = NULL;
-        mem_usage = mc.PeakHeapGrowth();
+        MutexLock l(GetBenchmarkLock());
+        GetReportLabel()->clear();
+        use_real_time = false;
       }
-    }
-  */
-  running_benchmark = false;
 
-  for (BenchmarkReporter::Run& report : state.runs) {
-    double seconds = (use_real_time ? report.real_accumulated_time
-                                    : report.cpu_accumulated_time);
-    report.benchmark_name = b.name;
-    report.report_label = state.label;
-    report.bytes_per_second = state.stats.bytes_processed / seconds;
-    report.items_per_second = state.stats.items_processed / seconds;
-    report.max_heapbytes_used = MeasurePeakHeapMemory(b);
-  }
+      Notification done;
+      timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));
 
-  br->ReportRuns(state.runs);
-}
+      ThreadStats total;
+      running_benchmark = true;
+      if (b.multithreaded) {
+        // If this is our first iteration of the while(true) loop then the
+        // threads haven't been started and can't be joined. Otherwise we need
+        // to join the threads before replacing them.
+        for (std::thread& thread : pool) {
+          if (thread.joinable())
+            thread.join();
+        }
+        for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+            pool[ti] = std::thread(&RunInThread, &b, iters, ti, &total);
+        }
+      } else {
+        // Run directly in this thread
+        RunInThread(&b, iters, 0, &total);
+      }
+      done.WaitForNotification();
+      running_benchmark = false;
 
-// Run the specified benchmark, measure its peak memory usage, and
-// return the peak memory usage.
-double Benchmark::MeasurePeakHeapMemory(const Instance&) {
-  if (!get_memory_usage) return 0.0;
-  double bytes = 0.0;
-  /*  TODO(dominich)
-   // Should we do multi-threaded runs?
-   const int num_threads = 1;
-   const int num_iters = 1;
-   {
- //    internal::MallocCounter mc(internal::MallocCounter::THIS_THREAD_ONLY);
-     running_benchmark = true;
-     timer_manager = new TimerManager(1, NULL);
- //    benchmark_mc = &mc;
-     timer_manager->StartTimer();
+      const double cpu_accumulated_time = timer_manager->cpu_time_used();
+      const double real_accumulated_time = timer_manager->real_time_used();
+      timer_manager.reset();
 
-     b.Run(num_iters);
+      VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
+              << real_accumulated_time << "\n";
 
-     running_benchmark = false;
-     delete timer_manager;
-     timer_manager = NULL;
- //    benchmark_mc = NULL;
- //    bytes = mc.PeakHeapGrowth();
-   }
-   */
-  return bytes;
-}
+      // Base decisions off of real time if requested by this benchmark.
+      double seconds = cpu_accumulated_time;
+      std::string label;
+      {
+        MutexLock l(GetBenchmarkLock());
+        label = *GetReportLabel();
+        if (use_real_time) {
+          seconds = real_accumulated_time;
+        }
+      }
 
-}  // end namespace internal
+      // If this was the first run, was elapsed time or cpu time large enough?
+      // If this is not the first run, go with the current value of iters.
+      if ((i > 0) ||
+          (iters == FLAGS_benchmark_iterations) ||
+          (iters >= kMaxIterations) ||
+          (seconds >= FLAGS_benchmark_min_time) ||
+          (real_accumulated_time >= 5*FLAGS_benchmark_min_time)) {
+        double bytes_per_second = 0;
+        if (total.bytes_processed > 0 && seconds != 0.0) {
+          bytes_per_second = (total.bytes_processed / seconds);
+        }
+        double items_per_second = 0;
+        if (total.items_processed > 0 && seconds != 0.0) {
+          items_per_second = (total.items_processed / seconds);
+        }
 
-State::State(FastClock* clock, SharedState* s, int t)
-    : thread_index(t),
-      state_(STATE_INITIAL),
-      clock_(clock),
-      shared_(s),
-      iterations_(0),
-      start_cpu_(0.0),
-      start_time_(0.0),
-      stop_time_micros_(0.0),
-      start_pause_cpu_(0.0),
-      pause_cpu_time_(0.0),
-      start_pause_real_(0.0),
-      pause_real_time_(0.0),
-      total_iterations_(0),
-      interval_micros_(static_cast<int64_t>(kNumMicrosPerSecond *
-                                            FLAGS_benchmark_min_time /
-                                            FLAGS_benchmark_repetitions)),
-      is_continuation_(false),
-      stats_(new ThreadStats()) {
-  CHECK(clock != nullptr);
-  CHECK(s != nullptr);
-}
+        // Create report about this benchmark run.
+        BenchmarkReporter::Run report;
+        report.benchmark_name = b.name;
+        report.report_label = label;
+        // Report the total iterations across all threads.
+        report.iterations = static_cast<int64_t>(iters) * b.threads;
+        report.real_accumulated_time = real_accumulated_time;
+        report.cpu_accumulated_time = cpu_accumulated_time;
+        report.bytes_per_second = bytes_per_second;
+        report.items_per_second = items_per_second;
+        reports.push_back(report);
+        break;
+      }
 
-bool State::KeepRunning() {
-  // Fast path
-  if ((FLAGS_benchmark_iterations == 0 &&
-       !clock_->HasReached(stop_time_micros_ +
-                           kNumMicrosPerSecond * pause_real_time_)) ||
-      iterations_ < FLAGS_benchmark_iterations) {
-    ++iterations_;
-    return true;
-  }
-
-  // To block thread 0 until all other threads exit, we have a signal exit
-  // point for KeepRunning() to return false.  The fast path above always
-  // returns true.
-  bool ret = false;
-  switch (state_) {
-    case STATE_INITIAL:
-      ret = StartRunning();
-      break;
-    case STATE_STARTING:
-      CHECK(false);
-      ret = true;
-      break;
-    case STATE_RUNNING:
-      ret = FinishInterval();
-      break;
-    case STATE_STOPPING:
-      ret = MaybeStop();
-      break;
-    case STATE_STOPPED:
-      CHECK(false);
-      ret = true;
-      break;
-  }
-
-  if (!ret && shared_->threads > 1 && thread_index == 0){
-    std::unique_lock<std::mutex> l(shared_->mu);
-
-    // Block until all other threads have exited.  We can then safely cleanup
-    // without other threads continuing to access shared variables inside the
-    // user-provided run function.
-    while (shared_->exited < shared_->threads - 1) {
-      shared_->cond.wait(l);
+      // See how much the iteration count should be increased by.
+      // Note: Avoid division by zero with max(seconds, 1ns).
+      double multiplier = FLAGS_benchmark_min_time * 1.4 / std::max(seconds, 1e-9);
+      // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+      // use the multiplier directly. Otherwise we use at most 10 times
+      // expansion.
+      // NOTE: When the last run was at least 10% of the min time the max
+      // expansion should be 14x.
+      bool is_significant = (seconds / FLAGS_benchmark_min_time) > 0.1;
+      multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
+      if (multiplier <= 1.0) multiplier = 2.0;
+      double next_iters = std::max(multiplier * iters, iters + 1.0);
+      if (next_iters > kMaxIterations) {
+        next_iters = kMaxIterations;
+      }
+      VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+      iters = static_cast<int>(next_iters + 0.5);
     }
   }
-
-  if (ret) {
-    ++iterations_;
+  br->ReportRuns(reports);
+  if (b.multithreaded) {
+    for (std::thread& thread : pool)
+      thread.join();
   }
-  return ret;
+}
+
+}  // namespace
+
+State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
+             int thread_i)
+    : started_(false), total_iterations_(0),
+      has_range_x_(has_x), range_x_(x),
+      has_range_y_(has_y), range_y_(y),
+      bytes_processed_(0), items_processed_(0),
+      thread_index(thread_i),
+      max_iterations(max_iters)
+{
+    CHECK(max_iterations != 0) << "At least one iteration must be run";
 }
 
 void State::PauseTiming() {
-  start_pause_cpu_ = MyCPUUsage() + ChildrenCPUUsage();
-  start_pause_real_ = walltime::Now();
+  // Add in time accumulated so far
+  CHECK(running_benchmark);
+  timer_manager->StopTimer();
 }
 
 void State::ResumeTiming() {
-  pause_cpu_time_ += MyCPUUsage() + ChildrenCPUUsage() - start_pause_cpu_;
-  pause_real_time_ += walltime::Now() - start_pause_real_;
+  CHECK(running_benchmark);
+  timer_manager->StartTimer();
 }
 
-void State::SetBytesProcessed(int64_t bytes) {
-  CHECK_EQ(STATE_STOPPED, state_);
-  std::lock_guard<std::mutex> l(shared_->mu);
-  stats_->bytes_processed = bytes;
+void State::UseRealTime() {
+  MutexLock l(GetBenchmarkLock());
+  use_real_time = true;
 }
 
-void State::SetItemsProcessed(int64_t items) {
-  CHECK_EQ(STATE_STOPPED, state_);
-  std::lock_guard<std::mutex> l(shared_->mu);
-  stats_->items_processed = items;
+void State::SetLabel(const char* label) {
+  CHECK(running_benchmark);
+  MutexLock l(GetBenchmarkLock());
+  *GetReportLabel() = label;
 }
 
-void State::SetLabel(const std::string& label) {
-  CHECK_EQ(STATE_STOPPED, state_);
-  std::lock_guard<std::mutex> l(shared_->mu);
-  shared_->label = label;
-}
-
-int State::range_x() const {
-  CHECK(shared_->instance->rangeXset);
-  /*
-  <<
-      "Failed to get range_x as it was not set. Did you register your "
-      "benchmark with a range parameter?";
-      */
-  return shared_->instance->rangeX;
-}
-
-int State::range_y() const {
-  CHECK(shared_->instance->rangeYset);
-  /* <<
-       "Failed to get range_y as it was not set. Did you register your "
-       "benchmark with a range parameter?";
-       */
-  return shared_->instance->rangeY;
-}
-
-bool State::StartRunning() {
-  bool last_thread = false;
-  {
-    std::lock_guard<std::mutex> l(shared_->mu);
-    CHECK_EQ(state_, STATE_INITIAL);
-    state_ = STATE_STARTING;
-    is_continuation_ = false;
-    CHECK_LT(shared_->starting, shared_->threads);
-    ++shared_->starting;
-    last_thread = shared_->starting == shared_->threads;
-  }
-
-  if (last_thread) {
-    clock_->InitType(use_real_time ? FastClock::REAL_TIME
-                                   : FastClock::CPU_TIME);
-    {
-      std::lock_guard<std::mutex> l(starting_mutex);
-      starting_cv.notify_all();
-    }
-  } else {
-    std::unique_lock<std::mutex> l(starting_mutex);
-    starting_cv.wait(l);
-  }
-  CHECK_EQ(state_, STATE_STARTING);
-  state_ = STATE_RUNNING;
-
-  NewInterval();
-  return true;
-}
-
-void State::NewInterval() {
-  stop_time_micros_ = clock_->NowMicros() + interval_micros_;
-  if (!is_continuation_) {
-    VLOG(1) << "Starting new interval; stopping in " << interval_micros_
-            << "\n";
-    iterations_ = 0;
-    pause_cpu_time_ = 0;
-    pause_real_time_ = 0;
-    start_cpu_ = MyCPUUsage() + ChildrenCPUUsage();
-    start_time_ = walltime::Now();
-  } else {
-    VLOG(1) << "Continuing interval; stopping in " << interval_micros_
-            << "\n";
-  }
-}
-
-bool State::FinishInterval() {
-  if ((FLAGS_benchmark_iterations != 0 &&
-       iterations_ <
-           FLAGS_benchmark_iterations / FLAGS_benchmark_repetitions) ||
-      iterations_ < 1) {
-    interval_micros_ *= 2;
-    VLOG(1) << "Not enough iterations in interval; "
-            << "Trying again for " << interval_micros_ << " useconds.\n";
-    is_continuation_ = false;
-    NewInterval();
-    return true;
-  }
-
-  BenchmarkReporter::Run data;
-  data.iterations = iterations_;
-  data.thread_index = thread_index;
-
-  const double accumulated_time = walltime::Now() - start_time_;
-  const double total_overhead = overhead * iterations_;
-  CHECK_LT(pause_real_time_, accumulated_time);
-  CHECK_LT(pause_real_time_ + total_overhead, accumulated_time);
-  data.real_accumulated_time =
-      accumulated_time - (pause_real_time_ + total_overhead);
-  data.cpu_accumulated_time = (MyCPUUsage() + ChildrenCPUUsage()) -
-                              (pause_cpu_time_ + start_cpu_);
-  total_iterations_ += iterations_;
-
-  bool keep_going = false;
-  {
-    std::lock_guard<std::mutex> l(shared_->mu);
-
-    // Either replace the last or add a new data point.
-    if (is_continuation_)
-      shared_->runs.back() = data;
-    else
-      shared_->runs.push_back(data);
-
-    if (FLAGS_benchmark_iterations != 0) {
-      // If we need more iterations, run another interval as a continuation.
-      keep_going = total_iterations_ < FLAGS_benchmark_iterations;
-      is_continuation_ = keep_going;
-    } else {
-      // If this is a repetition, run another interval as a new data point.
-      keep_going = shared_->runs.size() <
-                   static_cast<size_t>(FLAGS_benchmark_repetitions);
-      is_continuation_ = !keep_going;
-    }
-
-    if (!keep_going) {
-      ++shared_->stopping;
-      if (shared_->stopping < shared_->threads) {
-        // Other threads are still running, so continue running but without
-        // timing to present an expected background load to the other threads.
-        state_ = STATE_STOPPING;
-        keep_going = true;
-      } else {
-        state_ = STATE_STOPPED;
-      }
-    }
-  }
-
-  if (state_ == STATE_RUNNING) NewInterval();
-  return keep_going;
-}
-
-bool State::MaybeStop() {
-  std::lock_guard<std::mutex> l(shared_->mu);
-  if (shared_->stopping < shared_->threads) {
-    CHECK_EQ(state_, STATE_STOPPING);
-    return true;
-  }
-  state_ = STATE_STOPPED;
-  return false;
-}
-
-void State::Run() {
-  stats_->Reset();
-  shared_->instance->bm->function_(*this);
-  {
-    std::lock_guard<std::mutex> l(shared_->mu);
-    shared_->stats.Add(*stats_);
-  }
-}
-
-void State::RunAsThread() {
-  thread_ = std::thread(State::RunWrapper, this);
-}
-
-void State::Wait() {
-  if (thread_.joinable()) {
-    thread_.join();
-  }
-}
-
-// static
-void* State::RunWrapper(void* arg) {
-  State* that = (State*)arg;
-  CHECK(that != nullptr);
-  that->Run();
-
-  std::lock_guard<std::mutex> l(that->shared_->mu);
-
-  that->shared_->exited++;
-  if (that->thread_index > 0 &&
-      that->shared_->exited == that->shared_->threads - 1) {
-    // All threads but thread 0 have exited the user-provided run function.
-    // Thread 0 can now wake up and exit.
-    that->shared_->cond.notify_one();
-  }
-
-  return nullptr;
-}
+BenchmarkReporter::~BenchmarkReporter() {}
 
 namespace internal {
 
+bool ConsoleReporter::ReportContext(const Context& context) const {
+  // Prints the run banner (CPU info, timestamp, warnings) and the column header.
+  name_field_width_ = context.name_field_width;  // reused when printing each row
+  fprintf(stdout,
+          "Run on (%d X %0.0f MHz CPU%s)\n",
+          context.num_cpus,
+          context.mhz_per_cpu,
+          (context.num_cpus > 1) ? "s" : "");
+
+  int remainder_us;  // handed to walltime::Print by pointer; not read here
+  std::string walltime_str = walltime::Print(
+                                walltime::Now(), "%Y/%m/%d-%H:%M:%S",
+                                true,  // use local timezone
+                                &remainder_us);
+  fprintf(stdout, "%s\n", walltime_str.c_str());
+
+  if (context.cpu_scaling_enabled) {
+    fprintf(stdout, "***WARNING*** CPU scaling is enabled, the benchmark "
+                    "timings may be noisy\n");
+  }
+#ifndef NDEBUG
+  fprintf(stdout, "Build Type: DEBUG\n");
+#endif
+  int output_width =
+      fprintf(stdout,
+              "%-*s %10s %10s %10s\n",
+              static_cast<int>(name_field_width_),
+              "Benchmark",
+              "Time(ns)", "CPU(ns)",
+              "Iterations");
+  // fprintf returns a negative count on failure; only draw the rule when the
+  // header was written, so std::string is never built with a bogus length.
+  if (output_width > 0)
+    fprintf(stdout, "%s\n", std::string(output_width - 1, '-').c_str());
+  return true;
+}
+
+void ConsoleReporter::ReportRuns(
+    const std::vector<Run>& reports) const {
+  if (reports.empty()) {
+    return;
+  }
+  // Print one row per run; every run must carry the first run's benchmark name.
+  for (Run const& run : reports) {
+    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+    PrintRunData(run);
+  }
+
+  if (reports.size() < 2) {
+    // We don't report aggregated data if there was a single run.
+    return;
+  }
+
+  Run mean_data;
+  Run stddev_data;
+  ComputeStats(reports, &mean_data, &stddev_data);
+
+  // Print the two aggregate rows with PrintRunData, then a blank separator line.
+  PrintRunData(mean_data);
+  PrintRunData(stddev_data);
+  fprintf(stdout, "\n");
+}
+
+void ConsoleReporter::PrintRunData(const Run& result) const {
+  // Prints one result row: name, real/CPU time, iteration count, rates, label.
+  std::string rate;  // bytes/s column; empty unless bytes_per_second is positive
+  if (result.bytes_per_second > 0) {
+    rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
+  }
+  std::string items;  // items/s column; empty unless items_per_second is positive
+  if (result.items_per_second > 0) {
+    items = StrCat(" ", HumanReadableNumber(result.items_per_second),
+                   " items/s");
+  }
+
+  // A '*' field width is read as an int, so cast it the way ReportContext does.
+  ColorPrintf(COLOR_GREEN, "%-*s ", static_cast<int>(name_field_width_),
+              result.benchmark_name.c_str());
+  double const multiplier = 1e9; // nano second multiplier
+  if (result.iterations == 0) {  // print the totals as-is; never divide by zero
+    ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
+                result.real_accumulated_time * multiplier,
+                result.cpu_accumulated_time * multiplier);
+  } else {
+    ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
+                (result.real_accumulated_time * multiplier) /
+                    (static_cast<double>(result.iterations)),
+                (result.cpu_accumulated_time * multiplier) /
+                    (static_cast<double>(result.iterations)));
+  }
+  // %lld consumes a long long; cast so the argument matches on every platform.
+  ColorPrintf(COLOR_CYAN, "%10lld", static_cast<long long>(result.iterations));
+  ColorPrintf(COLOR_DEFAULT, "%*s %*s %s\n",
+              13, rate.c_str(),
+              18, items.c_str(),
+              result.report_label.c_str());
+}
+
 void RunMatchingBenchmarks(const std::string& spec,
                            const BenchmarkReporter* reporter) {
+  CHECK(reporter != nullptr);
   if (spec.empty()) return;
 
-  std::vector<internal::Benchmark::Instance> benchmarks;
-  BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &benchmarks);
+  std::vector<benchmark::internal::Benchmark::Instance> benchmarks;
+  auto families = benchmark::internal::BenchmarkFamilies::GetInstance();
+  if (!families->FindBenchmarks(spec, &benchmarks)) return;
+
 
   // Determine the width of the name field using a minimum width of 10.
   // Also determine max number of threads needed.
@@ -1144,45 +885,78 @@ void RunMatchingBenchmarks(const std::string& spec,
   BenchmarkReporter::Context context;
   context.num_cpus = NumCPUs();
   context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;
-  //  context.cpu_info = base::CompactCPUIDInfoString();
+
   context.cpu_scaling_enabled = CpuScalingEnabled();
   context.name_field_width = name_field_width;
 
-  if (reporter->ReportContext(context))
-    for (internal::Benchmark::Instance& benchmark : benchmarks)
-      Benchmark::RunInstance(benchmark, reporter);
+  if (reporter->ReportContext(context)) {
+    for (const auto& benchmark : benchmarks) {
+      RunBenchmark(benchmark, reporter);
+    }
+  }
 }
 
-void FindMatchingBenchmarkNames(const std::string& spec,
-                                std::vector<std::string>* benchmark_names) {
-  if (spec.empty()) return;
+} // end namespace internal
 
-  std::vector<internal::Benchmark::Instance> benchmarks;
-  BenchmarkFamilies::GetInstance()->FindBenchmarks(spec, &benchmarks);
-  std::transform(benchmarks.begin(), benchmarks.end(), benchmark_names->begin(),
-                 [](const internal::Benchmark::Instance& b) { return b.name; });
-}
 
-}  // end namespace internal
-
-void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter /*= nullptr*/) {
+void RunSpecifiedBenchmarks(const BenchmarkReporter* reporter) {
   std::string spec = FLAGS_benchmark_filter;
   if (spec.empty() || spec == "all")
     spec = ".";  // Regexp that matches all benchmarks
   internal::ConsoleReporter default_reporter;
-  internal::RunMatchingBenchmarks(
-      spec, reporter == nullptr ? &default_reporter : reporter);
+  internal::RunMatchingBenchmarks(spec, reporter ? reporter : &default_reporter);
 }
 
-void UseRealTime() { use_real_time = true; }
+namespace internal {
+
+void PrintUsageAndExit() {
+  fprintf(stdout,
+          "benchmark"
+          " [--benchmark_filter=<regex>]\n"
+          "          [--benchmark_iterations=<iterations>]\n"
+          "          [--benchmark_min_time=<min_time>]\n"
+          "          [--benchmark_repetitions=<num_repetitions>]\n"
+          "          [--color_print={true|false}]\n"
+          "          [--v=<verbosity>]\n");
+  exit(0);
+}
+
+void ParseCommandLineFlags(int* argc, const char** argv) {
+  using namespace benchmark;
+  for (int i = 1; i < *argc; ++i) {  // argv[0] is never examined
+    if (  // || short-circuits at the first parser that returns true for argv[i]
+        ParseStringFlag(argv[i], "benchmark_filter",
+                        &FLAGS_benchmark_filter) ||
+        ParseInt32Flag(argv[i], "benchmark_iterations",
+                       &FLAGS_benchmark_iterations) ||
+        ParseDoubleFlag(argv[i], "benchmark_min_time",
+                        &FLAGS_benchmark_min_time) ||
+        ParseInt32Flag(argv[i], "benchmark_repetitions",
+                       &FLAGS_benchmark_repetitions) ||
+        ParseBoolFlag(argv[i], "color_print",
+                       &FLAGS_color_print) ||
+        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
+      // Remove argv[i] by shifting the tail left. The last step copies
+      // argv[*argc], so the caller's array must be readable at that index.
+      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
+      --(*argc);
+      --i;  // slot i now holds the next argument; examine it on the next pass
+    } else if (IsFlag(argv[i], "help")) {
+      PrintUsageAndExit();  // does not return: it ends with exit(0)
+    }
+  }
+}
+
+} // end namespace internal
 
 void Initialize(int* argc, const char** argv) {
   internal::ParseCommandLineFlags(argc, argv);
   internal::SetLogLevel(FLAGS_v);
-  // Ensure walltime is initialized by a single thread by forcing the
-  // initialization.
+  // TODO remove this. It prints some output the first time it is called.
+  // We don't want to have this output printed during benchmarking.
+  MyCPUUsage();
+  // The first call to walltime::Now initializes it. Call it once to
+  // prevent the initialization from happening in a benchmark.
   walltime::Now();
-  internal::Benchmark::MeasureOverhead();
 }
 
-}  // end namespace benchmark
+} // end namespace benchmark
diff --git a/src/mutex.h b/src/mutex.h
new file mode 100644
index 00000000..f37ec35b
--- /dev/null
+++ b/src/mutex.h
@@ -0,0 +1,142 @@
+#ifndef BENCHMARK_MUTEX_H_
+#define BENCHMARK_MUTEX_H_
+
+#include <mutex>
+#include <condition_variable>
+
+// Enable thread safety attributes only with clang.
+// The attributes can be safely erased when compiling with other compilers.
+#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)   __attribute__((x))
+#else
+#define THREAD_ANNOTATION_ATTRIBUTE__(x)   // no-op
+#endif
+
+#define CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(capability(x))
+
+#define SCOPED_CAPABILITY \
+  THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+
+#define GUARDED_BY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
+
+#define PT_GUARDED_BY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
+
+#define ACQUIRED_BEFORE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
+
+#define ACQUIRED_AFTER(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
+
+#define REQUIRES(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
+
+#define REQUIRES_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
+
+#define ACQUIRE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
+
+#define ACQUIRE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
+
+#define RELEASE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
+
+#define RELEASE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
+
+#define TRY_ACQUIRE_SHARED(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
+
+#define EXCLUDES(...) \
+  THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__))
+
+#define ASSERT_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x))
+
+#define ASSERT_SHARED_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x))
+
+#define RETURN_CAPABILITY(x) \
+  THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
+
+#define NO_THREAD_SAFETY_ANALYSIS \
+  THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+
+
+namespace benchmark {
+
+typedef std::condition_variable Condition;
+
+// NOTE: Wrappers for std::mutex and std::unique_lock are provided so that
+// we can annotate them with thread safety attributes and use the
+// -Wthread-safety warning with clang. The standard library types cannot be
+// used directly because they do not provide the required annotations.
+class CAPABILITY("mutex") Mutex
+{
+public:
+  Mutex() {}
+  // lock()/unlock() forward to the wrapped std::mutex and carry the annotations.
+  void lock() ACQUIRE() { mut_.lock(); }
+  void unlock() RELEASE() { mut_.unlock(); }
+  std::mutex& native_handle() {  // unannotated access to the raw std::mutex
+    return mut_;
+  }
+private:
+  std::mutex mut_;
+};
+
+
+class SCOPED_CAPABILITY MutexLock
+{
+  typedef std::unique_lock<std::mutex> MutexLockImp;
+public:
+  // Holds m locked for this object's lifetime; explicit to forbid conversions.
+  explicit MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle()) {}
+  ~MutexLock() RELEASE() {}  // ml_'s own destructor performs the unlock
+  MutexLockImp& native_handle() { return ml_; }  // for condition_variable::wait
+private:
+  MutexLockImp ml_;
+};
+
+
+class Notification  // latching flag: once Notify() runs, it stays notified
+{
+public:
+  Notification() : notified_yet_(false) { }
+  // Blocks until Notify() has been called; returns at once if it already was.
+  void WaitForNotification() const EXCLUDES(mutex_) {
+    MutexLock m_lock(mutex_);
+    auto notified_fn = [this]() REQUIRES(mutex_) {
+                            return this->HasBeenNotified();
+                        };
+    cv_.wait(m_lock.native_handle(), notified_fn);  // predicate guards spurious wakeups
+  }
+  // Sets the flag while holding the lock, then wakes every waiting thread.
+  void Notify() EXCLUDES(mutex_) {
+    {
+      MutexLock lock(mutex_);
+      notified_yet_ = true;
+    }
+    cv_.notify_all();  // issued after the lock is released
+  }
+
+private:
+  bool HasBeenNotified() const REQUIRES(mutex_) {
+    return notified_yet_;
+  }
+
+  mutable Mutex mutex_;  // mutable so the const WaitForNotification() can lock it
+  mutable std::condition_variable cv_;
+  bool notified_yet_ GUARDED_BY(mutex_);
+};
+
+} // end namespace benchmark
+
+#endif // BENCHMARK_MUTEX_H_
diff --git a/src/string_util.cc b/src/string_util.cc
index 1be15341..ee1badc8 100644
--- a/src/string_util.cc
+++ b/src/string_util.cc
@@ -24,13 +24,13 @@ static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits),
 static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits),
               "Small SI and Big SI unit arrays must be the same size");
 
-static const int kUnitsSize = arraysize(kBigSIUnits);
+static const int64_t kUnitsSize = arraysize(kBigSIUnits);
 
 } // end anonymous namespace
 
 void ToExponentAndMantissa(double val, double thresh, int precision,
                            double one_k, std::string* mantissa,
-                           int* exponent) {
+                           int64_t* exponent) {
   std::stringstream mantissa_stream;
 
   if (val < 0) {
@@ -80,10 +80,10 @@ void ToExponentAndMantissa(double val, double thresh, int precision,
   *mantissa = mantissa_stream.str();
 }
 
-std::string ExponentToPrefix(int exponent, bool iec) {
+std::string ExponentToPrefix(int64_t exponent, bool iec) {
   if (exponent == 0) return "";
 
-  const int index = (exponent > 0 ? exponent - 1 : -exponent - 1);
+  const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1);
   if (index >= kUnitsSize) return "";
 
   const char* array =
@@ -97,7 +97,7 @@ std::string ExponentToPrefix(int exponent, bool iec) {
 std::string ToBinaryStringFullySpecified(double value, double threshold,
                                          int precision) {
   std::string mantissa;
-  int exponent;
+  int64_t exponent;
   ToExponentAndMantissa(value, threshold, precision, 1024.0, &mantissa,
                         &exponent);
   return mantissa + ExponentToPrefix(exponent, false);
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index ee3c238e..ace7caa4 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -34,6 +34,7 @@
 #include "check.h"
 #include "cycleclock.h"
 #include "internal_macros.h"
+#include "log.h"
 #include "sleep.h"
 
 namespace benchmark {
@@ -322,7 +323,7 @@ double MyCPUUsage() {
         return value;
       }
       // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
-      std::cout << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
+      VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
       use_cputime_ns = false;
     }
   }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5d4721be..bc62f432 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -20,3 +20,6 @@ add_test(filter_regex_none filter_test --benchmark_filter=monkey 0)
 add_test(filter_regex_wildcard filter_test --benchmark_filter=.*Calculate.* 16)
 add_test(filter_regex_begin filter_test --benchmark_filter=^BM_Calculate.* 16)
 add_test(filter_regex_end filter_test --benchmark_filter=.*Pi$ 8)
+
+compile_benchmark_test(basic_test)
+add_test(basic basic_test)
diff --git a/test/basic_test.cc b/test/basic_test.cc
new file mode 100644
index 00000000..d14f577a
--- /dev/null
+++ b/test/basic_test.cc
@@ -0,0 +1,105 @@
+
+#include <cstddef>
+
+#include "benchmark/benchmark.h"
+
+#define BASIC_BENCHMARK_TEST(x) \
+    BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
+
+void BM_empty(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    volatile std::size_t x = state.iterations();
+    ((void)x);
+  }
+}
+BENCHMARK(BM_empty);
+BENCHMARK(BM_empty)->ThreadPerCpu();
+
+void BM_spin_empty(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    for (int x = 0; x < state.range_x(); ++x) {
+      volatile int dummy = x;
+      ((void)dummy);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_empty);
+BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu();
+
+void BM_spin_pause_before(benchmark::State& state) {
+  for (int i = 0; i < state.range_x(); ++i) {
+    volatile int dummy = i;
+    ((void)dummy);
+  }
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      volatile int dummy = i;
+      ((void)dummy);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_before);
+BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu();
+
+
+void BM_spin_pause_during(benchmark::State& state) {
+  while(state.KeepRunning()) {
+    state.PauseTiming();
+    for (int i = 0; i < state.range_x(); ++i) {
+      volatile int dummy = i;
+      ((void)dummy);
+    }
+    state.ResumeTiming();
+    for (int i = 0; i < state.range_x(); ++i) {
+      volatile int dummy = i;
+      ((void)dummy);
+    }
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_during);
+BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu();
+
+
+void BM_spin_pause_after(benchmark::State& state) {
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      volatile int dummy = i;
+      ((void)dummy);
+    }
+  }
+  for (int i = 0; i < state.range_x(); ++i) {
+    volatile int dummy = i;
+    ((void)dummy);
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_after);
+BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu();
+
+
+void BM_spin_pause_before_and_after(benchmark::State& state) {
+  for (int i = 0; i < state.range_x(); ++i) {
+    volatile int dummy = i;
+    ((void)dummy);
+  }
+  while(state.KeepRunning()) {
+    for (int i = 0; i < state.range_x(); ++i) {
+      volatile int dummy = i;
+      ((void)dummy);
+    }
+  }
+  for (int i = 0; i < state.range_x(); ++i) {
+    volatile int dummy = i;
+    ((void)dummy);
+  }
+}
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after);
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu();
+
+
+void BM_empty_stop_start(benchmark::State& state) {
+  while (state.KeepRunning()) { }
+}
+BENCHMARK(BM_empty_stop_start);
+BENCHMARK(BM_empty_stop_start)->ThreadPerCpu();
+
+BENCHMARK_MAIN()
diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc
index 2ce1001d..d44ea319 100644
--- a/test/benchmark_test.cc
+++ b/test/benchmark_test.cc
@@ -53,18 +53,22 @@ static void BM_Factorial(benchmark::State& state) {
   while (state.KeepRunning())
     fac_42 = Factorial(8);
   // Prevent compiler optimizations
-  std::cout << fac_42;
+  std::stringstream ss;
+  ss << fac_42;
+  state.SetLabel(ss.str());
 }
 BENCHMARK(BM_Factorial);
 
 static void BM_FactorialRealTime(benchmark::State& state) {
-  benchmark::UseRealTime();
+  state.UseRealTime();
 
   int fac_42 = 0;
   while (state.KeepRunning())
     fac_42 = Factorial(8);
   // Prevent compiler optimizations
-  std::cout << fac_42;
+  std::stringstream ss;
+  ss << fac_42;
+  state.SetLabel(ss.str());
 }
 BENCHMARK(BM_FactorialRealTime);
 
@@ -158,12 +162,5 @@ static void BM_LongTest(benchmark::State& state) {
 }
 BENCHMARK(BM_LongTest)->Range(1<<16,1<<28);
 
-int main(int argc, const char* argv[]) {
-  benchmark::Initialize(&argc, argv);
-
-  assert(Factorial(8) == 40320);
-  assert(CalculatePi(1) == 0.0);
-
-  benchmark::RunSpecifiedBenchmarks();
-}
+BENCHMARK_MAIN()