Make `PauseTiming()` and `ResumeTiming()` per thread. (#286)

* Change to using per-thread timers * fix bad assertions * fix copy paste error on windows * Fix thread safety annotations * Make null-log thread safe * remove remaining globals * use chrono for walltime since it is thread safe * consolidate timer functions * Add missing ctime include * Rename to be consistent with Google style * Format patch using clang-format * cleanup -Wthread-safety configuration * Don't trust _POSIX_FEATURE macros because OS X lies. * Fix OS X thread timings * attempt to fix mingw build * Attempt to make mingw work again * Revert old mingw workaround * improve diagnostics * Drastically improve OS X measurements * Use average real time instead of max
2016-09-02 21:34:34 -06:00 · 2016-09-02 21:34:34 -06:00 · cba945e37d
parent 94c2a30a3e
commit cba945e37d
19 changed files with 539 additions and 747 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -87,8 +87,7 @@ else()
    add_cxx_compiler_flag(-Wstrict-aliasing)
  endif()
  add_cxx_compiler_flag(-Wthread-safety)
-  if (HAVE_WTHREAD_SAFETY)
-    add_definitions(-DHAVE_WTHREAD_SAFETY)
+  if (HAVE_CXX_FLAG_WTHREAD_SAFETY)
    cxx_feature_check(THREAD_SAFETY_ATTRIBUTES)
  endif()

@ -152,7 +151,6 @@ cxx_feature_check(STD_REGEX)
 cxx_feature_check(GNU_POSIX_REGEX)
 cxx_feature_check(POSIX_REGEX)
 cxx_feature_check(STEADY_CLOCK)
-
 # Ensure we have pthreads
 find_package(Threads REQUIRED)

--- a/include/benchmark/benchmark_api.h
+++ b/include/benchmark/benchmark_api.h
@ -270,31 +270,25 @@ enum BigO {
 // computational complexity for the benchmark.
 typedef double(BigOFunc)(int);

+namespace internal {
+class ThreadTimer;
+class ThreadManager;
+}
+
 // State is passed to a running Benchmark and contains state for the
 // benchmark to use.
 class State {
 public:
-  State(size_t max_iters, const std::vector<int>& ranges,
-        int thread_i, int n_threads);
-
  // Returns true if the benchmark should continue through another iteration.
  // NOTE: A benchmark may not return from the test until KeepRunning() has
  // returned false.
  bool KeepRunning() {
    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
-      assert(!finished_);
-      started_ = true;
-      ResumeTiming();
+      StartKeepRunning();
    }
    bool const res = total_iterations_++ < max_iterations;
    if (BENCHMARK_BUILTIN_EXPECT(!res, false)) {
-      assert(started_ && (!finished_ || error_occurred_));
-      if (!error_occurred_) {
-        PauseTiming();
-      }
-      // Total iterations now is one greater than max iterations. Fix this.
-      total_iterations_ = max_iterations;
-      finished_ = true;
+      FinishKeepRunning();
    }
    return res;
  }
@ -304,10 +298,11 @@ public:
  // Stop the benchmark timer.  If not called, the timer will be
  // automatically stopped after KeepRunning() returns false for the first time.
  //
-  // For threaded benchmarks the PauseTiming() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all active threads have made their ith call.
-  // The timer will stop when the last thread has called this function.
+  // For threaded benchmarks the PauseTiming() function only pauses the timing
+  // for the current thread.
+  //
+  // NOTE: The "real time" measurement is per-thread. If different threads
+  // report different measurements the largest one is reported.
  //
  // NOTE: PauseTiming()/ResumeTiming() are relatively
  // heavyweight, and so their use should generally be avoided
@ -319,11 +314,6 @@ public:
  // Start the benchmark timer.  The timer is NOT running on entrance to the
  // benchmark function. It begins running after the first call to KeepRunning()
  //
-  // For threaded benchmarks the ResumeTiming() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all active threads have made their ith call.
-  // The timer will start when the last thread has called this function.
-  //
  // NOTE: PauseTiming()/ResumeTiming() are relatively
  // heavyweight, and so their use should generally be avoided
  // within each benchmark iteration, if possible.
@ -335,10 +325,10 @@ public:
  // thread and report an error with the specified 'msg'. After this call
  // the user may explicitly 'return' from the benchmark.
  //
-  // For threaded benchmarks only the current thread stops executing. If
-  // multiple threads report an error only the first error message is used.
-  // The current thread is no longer considered 'active' by
-  // 'PauseTiming()' and 'ResumingTiming()'.
+  // For threaded benchmarks only the current thread stops executing and future
+  // calls to `KeepRunning()` will block until all threads have completed
+  // the `KeepRunning()` loop. If multiple threads report an error only the
+  // first error message is used.
  //
  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
  // the current scope immediately. If the function is called from within
@ -351,10 +341,8 @@ public:
  // is used instead of automatically measured time if UseManualTime() was
  // specified.
  //
-  // For threaded benchmarks the SetIterationTime() function acts
-  // like a barrier.  I.e., the ith call by a particular thread to this
-  // function will block until all threads have made their ith call.
-  // The time will be set by the last thread to call this function.
+  // For threaded benchmarks the final value will be set to the largest
+  // reported values.
  void SetIterationTime(double seconds);

  // Set the number of bytes processed by the current benchmark
@ -465,7 +453,16 @@ public:
  const int threads;
  const size_t max_iterations;

-private:
+  // TODO make me private
+  State(size_t max_iters, const std::vector<int>& ranges, int thread_i,
+        int n_threads, internal::ThreadTimer* timer,
+        internal::ThreadManager* manager);
+
+ private:
+  void StartKeepRunning();
+  void FinishKeepRunning();
+  internal::ThreadTimer* timer_;
+  internal::ThreadManager* manager_;
  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
 };

--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -8,9 +8,9 @@ endif()

 # Define the source files
 set(SOURCE_FILES "benchmark.cc" "colorprint.cc" "commandlineflags.cc"
-                 "console_reporter.cc" "csv_reporter.cc" "json_reporter.cc"
-                 "log.cc" "reporter.cc" "sleep.cc" "string_util.cc"
-                 "sysinfo.cc" "walltime.cc" "complexity.cc")
+                 "console_reporter.cc" "csv_reporter.cc"
+                 "json_reporter.cc" "reporter.cc" "sleep.cc"
+                 "string_util.cc" "sysinfo.cc" "complexity.cc" "timers.cc")
 # Add headers to the list of source files. cmake does not require this,
 # but IDEs such as Visual Studio need this to add the headers
 # to the generated project.
@ -19,8 +19,7 @@ list(APPEND SOURCE_FILES "${_d}/benchmark.h" "${_d}/benchmark_api.h"
            "${_d}/macros.h" "${_d}/reporter.h" "arraysize.h" "check.h"
            "colorprint.h" "commandlineflags.h" "complexity.h"
            "cycleclock.h" "internal_macros.h" "log.h" "mutex.h"
-            "re.h" "sleep.h" "stat.h" "string_util.h" "sysinfo.h"
-            "walltime.h")
+            "re.h" "sleep.h" "stat.h" "string_util.h" "sysinfo.h" "timers.h")
 unset(_d)

 # Determine the correct regular expression engine to use
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@ -41,7 +41,7 @@
 #include "stat.h"
 #include "string_util.h"
 #include "sysinfo.h"
-#include "walltime.h"
+#include "timers.h"

 DEFINE_bool(benchmark_list_tests, false,
            "Print a list of benchmarks. This option overrides all other "
@ -92,20 +92,7 @@ namespace internal {

 void UseCharPointer(char const volatile*) {}

-// NOTE: This is a dummy "mutex" type used to denote the actual mutex
-// returned by GetBenchmarkLock(). This is only used to placate the thread
-// safety warnings by giving the return of GetBenchmarkLock() a name.
-struct CAPABILITY("mutex") BenchmarkLockType {};
-BenchmarkLockType BenchmarkLockVar;
-
-} // end namespace internal
-
-inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
-GetBenchmarkLock()
-{
-  static Mutex lock;
-  return lock;
-}
+}  // end namespace internal

 namespace {

@ -120,148 +107,111 @@ static const int kRangeMultiplier = 8;
 static const size_t kMaxFamilySize = 100;
 static const size_t kMaxIterations = 1000000000;

-bool running_benchmark = false;
+}  // end namespace

-// Global variable so that a benchmark can cause a little extra printing
-std::string* GetReportLabel() {
-    static std::string label GUARDED_BY(GetBenchmarkLock());
-    return &label;
-}
+namespace internal {

-// Global variable so that a benchmark can report an error as a human readable
-// string. If error_message is null no error occurred.
-#if defined(_MSC_VER) && _MSC_VER <= 1800
-typedef char* error_message_type;
-#else
-typedef const char* error_message_type;
-#endif
+// NOTE: This is a dummy "mutex" type used to denote the actual mutex
+// returned by GetBenchmarkMutex(). This is only used to placate the thread
+// safety warnings by giving the return of GetBenchmarkLock() a name.
+struct CAPABILITY("mutex") BenchmarkLockType {};
+BenchmarkLockType BenchmarkLockVar;

-static std::atomic<error_message_type> error_message = ATOMIC_VAR_INIT(nullptr);
+class ThreadManager {
+ public:
+  ThreadManager(int num_threads)
+      : alive_threads_(num_threads), start_stop_barrier_(num_threads) {}

-// TODO(ericwf): support MallocCounter.
-//static benchmark::MallocCounter *benchmark_mc;
+  Mutex& GetBenchmarkMutex() const
+      RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar) {
+    return benchmark_mutex_;
+  }

-struct ThreadStats {
-    ThreadStats() : bytes_processed(0), items_processed(0), complexity_n(0) {}
-    int64_t bytes_processed;
-    int64_t items_processed;
-    int  complexity_n;
+  bool StartStopBarrier() EXCLUDES(end_cond_mutex_) {
+    return start_stop_barrier_.wait();
+  }
+
+  void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) {
+    start_stop_barrier_.removeThread();
+    if (--alive_threads_ == 0) {
+      MutexLock lock(end_cond_mutex_);
+      end_condition_.notify_all();
+    }
+  }
+
+  void WaitForAllThreads() EXCLUDES(end_cond_mutex_) {
+    MutexLock lock(end_cond_mutex_);
+    end_condition_.wait(lock.native_handle(),
+                        [this]() { return alive_threads_ == 0; });
+  }
+
+ public:
+  GUARDED_BY(GetBenchmarkMutex()) double real_time_used = 0;
+  GUARDED_BY(GetBenchmarkMutex()) double cpu_time_used = 0;
+  GUARDED_BY(GetBenchmarkMutex()) double manual_time_used = 0;
+  GUARDED_BY(GetBenchmarkMutex()) int64_t bytes_processed = 0;
+  GUARDED_BY(GetBenchmarkMutex()) int64_t items_processed = 0;
+  GUARDED_BY(GetBenchmarkMutex()) int complexity_n = 0;
+  GUARDED_BY(GetBenchmarkMutex()) std::string report_label_;
+  GUARDED_BY(GetBenchmarkMutex()) std::string error_message_;
+  GUARDED_BY(GetBenchmarkMutex()) bool has_error_ = false;
+
+ private:
+  mutable Mutex benchmark_mutex_;
+  std::atomic<int> alive_threads_;
+  Barrier start_stop_barrier_;
+  Mutex end_cond_mutex_;
+  Condition end_condition_;
 };

 // Timer management class
-class TimerManager {
+class ThreadTimer {
 public:
-  TimerManager(int num_threads, Notification* done)
-      : num_threads_(num_threads),
-        running_threads_(num_threads),
-        done_(done),
-        running_(false),
+  ThreadTimer()
+      : running_(false),
        real_time_used_(0),
        cpu_time_used_(0),
-        manual_time_used_(0),
-        num_finalized_(0),
-        phase_number_(0),
-        entered_(0)
-  {
+        manual_time_used_(0) {}
+
+  // Called by each thread
+  void StartTimer() {
+    running_ = true;
+    start_real_time_ = ChronoClockNow();
+    start_cpu_time_ = ThreadCPUUsage();
  }

  // Called by each thread
-  void StartTimer() EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        CHECK(!running_) << "Called StartTimer when timer is already running";
-        running_ = true;
-        start_real_time_ = walltime::Now();
-        start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
-       }
-     }
-     if (last_thread) {
-       phase_condition_.notify_all();
-     }
+  void StopTimer() {
+    CHECK(running_);
+    running_ = false;
+    real_time_used_ += ChronoClockNow() - start_real_time_;
+    cpu_time_used_ += ThreadCPUUsage() - start_cpu_time_;
  }

  // Called by each thread
-  void StopTimer() EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        CHECK(running_) << "Called StopTimer when timer is already stopped";
-        InternalStop();
-      }
-    }
-    if (last_thread) {
-      phase_condition_.notify_all();
-    }
-  }
+  void SetIterationTime(double seconds) { manual_time_used_ += seconds; }

-  // Called by each thread
-  void SetIterationTime(double seconds) EXCLUDES(lock_) {
-    bool last_thread = false;
-    {
-      MutexLock ml(lock_);
-      last_thread = Barrier(ml);
-      if (last_thread) {
-        manual_time_used_ += seconds;
-      }
-    }
-    if (last_thread) {
-      phase_condition_.notify_all();
-    }
-  }
-
-  // Called by each thread
-  void Finalize() EXCLUDES(lock_) {
-    MutexLock l(lock_);
-    num_finalized_++;
-    if (num_finalized_ == num_threads_) {
-      CHECK(!running_) <<
-        "The timer should be stopped before the timer is finalized";
-      done_->Notify();
-    }
-  }
-
-  void RemoveErroredThread() EXCLUDES(lock_) {
-    MutexLock ml(lock_);
-    int last_thread = --running_threads_ == 0;
-    if (last_thread && running_)
-      InternalStop();
-    else if (!last_thread)
-      phase_condition_.notify_all();
-  }
+  bool running() const { return running_; }

  // REQUIRES: timer is not running
-  double real_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
+  double real_time_used() {
    CHECK(!running_);
    return real_time_used_;
  }

  // REQUIRES: timer is not running
-  double cpu_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
+  double cpu_time_used() {
    CHECK(!running_);
    return cpu_time_used_;
  }

  // REQUIRES: timer is not running
-  double manual_time_used() EXCLUDES(lock_) {
-    MutexLock l(lock_);
+  double manual_time_used() {
    CHECK(!running_);
    return manual_time_used_;
  }

 private:
-  Mutex lock_;
-  Condition phase_condition_;
-  int num_threads_;
-  int running_threads_;
-  Notification* done_;
-
  bool running_;                // Is the timer running
  double start_real_time_;      // If running_
  double start_cpu_time_;       // If running_
@ -272,52 +222,8 @@ class TimerManager {
  // Manually set iteration time. User sets this with SetIterationTime(seconds).
  double manual_time_used_;

-  // How many threads have called Finalize()
-  int num_finalized_;
-
-  // State for barrier management
-  int phase_number_;
-  int entered_;         // Number of threads that have entered this barrier
-
-  void InternalStop() REQUIRES(lock_) {
-    CHECK(running_);
-    running_ = false;
-    real_time_used_ += walltime::Now() - start_real_time_;
-    cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage())
-                       - start_cpu_time_);
-  }
-
-  // Enter the barrier and wait until all other threads have also
-  // entered the barrier.  Returns iff this is the last thread to
-  // enter the barrier.
-  bool Barrier(MutexLock& ml) REQUIRES(lock_) {
-    CHECK_LT(entered_, running_threads_);
-    entered_++;
-    if (entered_ < running_threads_) {
-      // Wait for all threads to enter
-      int phase_number_cp = phase_number_;
-      auto cb = [this, phase_number_cp]() {
-        return this->phase_number_ > phase_number_cp ||
-               entered_ == running_threads_; // A thread has aborted in error
-      };
-      phase_condition_.wait(ml.native_handle(), cb);
-      if (phase_number_ > phase_number_cp)
-        return false;
-      // else (running_threads_ == entered_) and we are the last thread.
-    }
-    // Last thread has reached the barrier
-    phase_number_++;
-    entered_ = 0;
-    return true;
-  }
 };

-// TimerManager for current run.
-static std::unique_ptr<TimerManager> timer_manager = nullptr;
-
-} // end namespace
-
-namespace internal {

 enum ReportMode : unsigned {
    RM_Unspecified, // The mode has not been manually specified
@ -802,32 +708,33 @@ namespace {
 // Adds the stats collected for the thread into *total.
 void RunInThread(const benchmark::internal::Benchmark::Instance* b,
                 size_t iters, int thread_id,
-                 ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
-  State st(iters, b->arg, thread_id, b->threads);
+                 internal::ThreadManager* manager) {
+  internal::ThreadTimer timer;
+  State st(iters, b->arg, thread_id, b->threads, &timer, manager);
  b->benchmark->Run(st);
  CHECK(st.iterations() == st.max_iterations) <<
    "Benchmark returned before State::KeepRunning() returned false!";
  {
-    MutexLock l(GetBenchmarkLock());
-    total->bytes_processed += st.bytes_processed();
-    total->items_processed += st.items_processed();
-    total->complexity_n += st.complexity_length_n();
+    MutexLock l(manager->GetBenchmarkMutex());
+    manager->cpu_time_used += timer.cpu_time_used();
+    manager->real_time_used += timer.real_time_used();
+    manager->manual_time_used += timer.manual_time_used();
+    manager->bytes_processed += st.bytes_processed();
+    manager->items_processed += st.items_processed();
+    manager->complexity_n += st.complexity_length_n();
  }
-
-  timer_manager->Finalize();
+  manager->NotifyThreadComplete();
 }

-std::vector<BenchmarkReporter::Run>
-RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
-             std::vector<BenchmarkReporter::Run>* complexity_reports)
-  EXCLUDES(GetBenchmarkLock()) {
+std::vector<BenchmarkReporter::Run> RunBenchmark(
+    const benchmark::internal::Benchmark::Instance& b,
+    std::vector<BenchmarkReporter::Run>* complexity_reports) {
  std::vector<BenchmarkReporter::Run> reports; // return value

  size_t iters = 1;
-
+  const int num_threads = b.multithreaded ? b.threads : 1;
  std::vector<std::thread> pool;
-  if (b.multithreaded)
-    pool.resize(b.threads);
+  if (num_threads > 1) pool.resize(num_threads -1);

  const int repeats = b.repetitions != 0 ? b.repetitions
                                         : FLAGS_benchmark_repetitions;
@ -841,17 +748,7 @@ RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
      // Try benchmark
      VLOG(2) << "Running " << b.name << " for " << iters << "\n";

-      {
-        MutexLock l(GetBenchmarkLock());
-        GetReportLabel()->clear();
-      }
-      error_message = nullptr;
-
-      Notification done;
-      timer_manager = std::unique_ptr<TimerManager>(new TimerManager(b.threads, &done));
-
-      ThreadStats total;
-      running_benchmark = true;
+      internal::ThreadManager manager(num_threads);
      if (b.multithreaded) {
        // If this is out first iteration of the while(true) loop then the
        // threads haven't been started and can't be joined. Otherwise we need
@ -861,19 +758,17 @@ RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
            thread.join();
        }
        for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-            pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti), &total);
+          pool[ti] = std::thread(&RunInThread, &b, iters,
+                                 static_cast<int>(ti + 1), &manager);
        }
-      } else {
-        // Run directly in this thread
-        RunInThread(&b, iters, 0, &total);
      }
-      done.WaitForNotification();
-      running_benchmark = false;
+      RunInThread(&b, iters, 0, &manager);
+      manager.WaitForAllThreads();
+      MutexLock l(manager.GetBenchmarkMutex());

-      const double cpu_accumulated_time = timer_manager->cpu_time_used();
-      const double real_accumulated_time = timer_manager->real_time_used();
-      const double manual_accumulated_time = timer_manager->manual_time_used();
-      timer_manager.reset();
+      const double cpu_accumulated_time = manager.cpu_time_used;
+      const double real_accumulated_time = manager.real_time_used / num_threads;
+      const double manual_accumulated_time = manager.manual_time_used / num_threads;

      VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
              << real_accumulated_time << "\n";
@ -886,41 +781,30 @@ RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
          seconds = real_accumulated_time;
      }

-      std::string label;
-      {
-        MutexLock l(GetBenchmarkLock());
-        label = *GetReportLabel();
-      }
-      error_message_type error_msg = error_message;
-
      const double min_time = !IsZero(b.min_time) ? b.min_time
                                                  : FLAGS_benchmark_min_time;
-
      // If this was the first run, was elapsed time or cpu time large enough?
      // If this is not the first run, go with the current value of iter.
-      if ((i > 0) || (error_msg != nullptr) ||
-          (iters >= kMaxIterations) ||
-          (seconds >= min_time) ||
-          (real_accumulated_time >= 5*min_time)) {
-
+      if ((i > 0) || manager.has_error_ || (iters >= kMaxIterations) ||
+          (seconds >= min_time) || (real_accumulated_time >= 5 * min_time)) {
        // Create report about this benchmark run.
        BenchmarkReporter::Run report;
        report.benchmark_name = b.name;
-        report.error_occurred = error_msg != nullptr;
-        report.error_message = error_msg != nullptr ? error_msg : "";
-        report.report_label = label;
+        report.error_occurred = manager.has_error_;
+        report.error_message = manager.error_message_;
+        report.report_label = manager.report_label_;
        // Report the total iterations across all threads.
        report.iterations = static_cast<int64_t>(iters) * b.threads;
        report.time_unit = b.time_unit;

        if (!report.error_occurred) {
          double bytes_per_second = 0;
-          if (total.bytes_processed > 0 && seconds > 0.0) {
-            bytes_per_second = (total.bytes_processed / seconds);
+          if (manager.bytes_processed > 0 && seconds > 0.0) {
+            bytes_per_second = (manager.bytes_processed / seconds);
          }
          double items_per_second = 0;
-          if (total.items_processed > 0 && seconds > 0.0) {
-            items_per_second = (total.items_processed / seconds);
+          if (manager.items_processed > 0 && seconds > 0.0) {
+            items_per_second = (manager.items_processed / seconds);
          }

          if (b.use_manual_time) {
@ -931,7 +815,7 @@ RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
          report.cpu_accumulated_time = cpu_accumulated_time;
          report.bytes_per_second = bytes_per_second;
          report.items_per_second = items_per_second;
-          report.complexity_n = total.complexity_n;
+          report.complexity_n = manager.complexity_n;
          report.complexity = b.complexity;
          report.complexity_lambda = b.complexity_lambda;
          if(report.complexity != oNone)
@ -981,55 +865,77 @@ RunBenchmark(const benchmark::internal::Benchmark::Instance& b,

 }  // namespace

-State::State(size_t max_iters, const std::vector<int>& ranges,
-             int thread_i, int n_threads)
-    : started_(false), finished_(false), total_iterations_(0),
+State::State(size_t max_iters, const std::vector<int>& ranges, int thread_i,
+             int n_threads, internal::ThreadTimer* timer,
+             internal::ThreadManager* manager)
+    : started_(false),
+      finished_(false),
+      total_iterations_(0),
      range_(ranges),
-      bytes_processed_(0), items_processed_(0),
+      bytes_processed_(0),
+      items_processed_(0),
      complexity_n_(0),
      error_occurred_(false),
      thread_index(thread_i),
      threads(n_threads),
-      max_iterations(max_iters)
-{
-    CHECK(max_iterations != 0) << "At least one iteration must be run";
-    CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
+      max_iterations(max_iters),
+      timer_(timer),
+      manager_(manager) {
+  CHECK(max_iterations != 0) << "At least one iteration must be run";
+  CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
 }

 void State::PauseTiming() {
  // Add in time accumulated so far
-  CHECK(running_benchmark);
  CHECK(started_ && !finished_ && !error_occurred_);
-  timer_manager->StopTimer();
+  timer_->StopTimer();
 }

 void State::ResumeTiming() {
-  CHECK(running_benchmark);
  CHECK(started_ && !finished_ && !error_occurred_);
-  timer_manager->StartTimer();
+  timer_->StartTimer();
 }

 void State::SkipWithError(const char* msg) {
  CHECK(msg);
  error_occurred_ = true;
-  error_message_type expected_no_error_msg = nullptr;
-  error_message.compare_exchange_weak(expected_no_error_msg,
-    const_cast<error_message_type>(msg));
-  started_ = finished_ = true;
+  {
+    MutexLock l(manager_->GetBenchmarkMutex());
+    if (manager_->has_error_ == false) {
+      manager_->error_message_ = msg;
+      manager_->has_error_ = true;
+    }
+  }
  total_iterations_ = max_iterations;
-  timer_manager->RemoveErroredThread();
+  if (timer_->running()) timer_->StopTimer();
 }

 void State::SetIterationTime(double seconds)
 {
-  CHECK(running_benchmark);
-  timer_manager->SetIterationTime(seconds);
+  timer_->SetIterationTime(seconds);
 }

 void State::SetLabel(const char* label) {
-  CHECK(running_benchmark);
-  MutexLock l(GetBenchmarkLock());
-  *GetReportLabel() = label;
+  MutexLock l(manager_->GetBenchmarkMutex());
+  manager_->report_label_ = label;
+}
+
+void State::StartKeepRunning() {
+  CHECK(!started_ && !finished_);
+  started_ = true;
+  manager_->StartStopBarrier();
+  if (!error_occurred_) ResumeTiming();
+}
+
+void State::FinishKeepRunning() {
+  CHECK(started_ && (!finished_ || error_occurred_));
+  if (!error_occurred_) {
+    PauseTiming();
+  }
+  // Total iterations now is one greater than max iterations. Fix this.
+  total_iterations_ = max_iterations;
+  finished_ = true;
+  manager_->StartStopBarrier();
 }

 namespace internal {
@ -1231,13 +1137,7 @@ int InitializeStreams() {

 void Initialize(int* argc, char** argv) {
  internal::ParseCommandLineFlags(argc, argv);
-  internal::SetLogLevel(FLAGS_v);
-  // TODO remove this. It prints some output the first time it is called.
-  // We don't want to have this ouput printed during benchmarking.
-  MyCPUUsage();
-  // The first call to walltime::Now initialized it. Call it once to
-  // prevent the initialization from happening in a benchmark.
-  walltime::Now();
+  internal::LogLevel() = FLAGS_v;
 }

 } // end namespace benchmark
--- a/src/check.h
+++ b/src/check.h
@ -33,9 +33,7 @@ public:
          << check << "' failed. ";
  }

-  std::ostream& GetLog() {
-    return log_;
-  }
+  LogType& GetLog() { return log_; }

  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
      log_ << std::endl;
@ -46,7 +44,7 @@ public:
  CheckHandler(const CheckHandler&) = delete;
  CheckHandler() = delete;
 private:
-  std::ostream& log_;
+ LogType& log_;
 };

 } // end namespace internal
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@ -28,7 +28,7 @@
 #include "commandlineflags.h"
 #include "internal_macros.h"
 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"

 namespace benchmark {

--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@ -23,7 +23,7 @@
 #include <vector>

 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"

 // File format reference: http://edoceo.com/utilitas/csv-file-format.

--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@ -23,7 +23,7 @@
 #include <vector>

 #include "string_util.h"
-#include "walltime.h"
+#include "timers.h"

 namespace benchmark {

--- a/src/log.cc
+++ b/src/log.cc
@ -1,40 +0,0 @@
-#include "log.h"
-
-#include <iostream>
-
-namespace benchmark {
-namespace internal {
-
-int& LoggingLevelImp() {
-    static int level = 0;
-    return level;
-}
-
-void SetLogLevel(int value) {
-    LoggingLevelImp() = value;
-}
-
-int GetLogLevel() {
-    return LoggingLevelImp();
-}
-
-class NullLogBuffer : public std::streambuf
-{
-public:
-  int overflow(int c) {
-    return c;
-  }
-};
-
-std::ostream& GetNullLogInstance() {
-  static NullLogBuffer log_buff;
-  static std::ostream null_log(&log_buff);
-  return null_log;
-}
-
-std::ostream& GetErrorLogInstance() {
-  return std::clog;
-}
-
-} // end namespace internal
-} // end namespace benchmark
--- a/src/log.h
+++ b/src/log.h
@ -1,19 +1,63 @@
 #ifndef BENCHMARK_LOG_H_
 #define BENCHMARK_LOG_H_

+#include <iostream>
 #include <ostream>

+#include "benchmark/macros.h"
+
 namespace benchmark {
 namespace internal {

-int GetLogLevel();
-void SetLogLevel(int level);
+typedef std::basic_ostream<char>&(EndLType)(std::basic_ostream<char>&);

-std::ostream& GetNullLogInstance();
-std::ostream& GetErrorLogInstance();
+class LogType {
+  friend LogType& GetNullLogInstance();
+  friend LogType& GetErrorLogInstance();

-inline std::ostream& GetLogInstanceForLevel(int level) {
-  if (level <= GetLogLevel()) {
+  // FIXME: Add locking to output.
+  template <class Tp>
+  friend LogType& operator<<(LogType&, Tp const&);
+  friend LogType& operator<<(LogType&, EndLType*);
+
+ private:
+  LogType(std::ostream* out) : out_(out) {}
+  std::ostream* out_;
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType);
+};
+
+template <class Tp>
+LogType& operator<<(LogType& log, Tp const& value) {
+  if (log.out_) {
+    *log.out_ << value;
+  }
+  return log;
+}
+
+inline LogType& operator<<(LogType& log, EndLType* m) {
+  if (log.out_) {
+    *log.out_ << m;
+  }
+  return log;
+}
+
+inline int& LogLevel() {
+  static int log_level = 0;
+  return log_level;
+}
+
+inline LogType& GetNullLogInstance() {
+  static LogType log(nullptr);
+  return log;
+}
+
+inline LogType& GetErrorLogInstance() {
+  static LogType log(&std::clog);
+  return log;
+}
+
+inline LogType& GetLogInstanceForLevel(int level) {
+  if (level <= LogLevel()) {
    return GetErrorLogInstance();
  }
  return GetNullLogInstance();
--- a/src/mutex.h
+++ b/src/mutex.h
@ -4,6 +4,8 @@
 #include <mutex>
 #include <condition_variable>

+#include "check.h"
+
 // Enable thread safety attributes only with clang.
 // The attributes can be safely erased when compiling with other compilers.
 #if defined(HAVE_THREAD_SAFETY_ATTRIBUTES)
@ -105,36 +107,58 @@ private:
  MutexLockImp ml_;
 };

+class Barrier {
+ public:
+  Barrier(int num_threads) : running_threads_(num_threads) {}

-class Notification
-{
-public:
-  Notification() : notified_yet_(false) { }
-
-  void WaitForNotification() const EXCLUDES(mutex_) {
-    MutexLock m_lock(mutex_);
-    auto notified_fn = [this]() REQUIRES(mutex_) {
-                            return this->HasBeenNotified();
-                        };
-    cv_.wait(m_lock.native_handle(), notified_fn);
-  }
-
-  void Notify() EXCLUDES(mutex_) {
+  // Called by each thread
+  bool wait() EXCLUDES(lock_) {
+    bool last_thread = false;
    {
-      MutexLock lock(mutex_);
-      notified_yet_ = 1;
+      MutexLock ml(lock_);
+      last_thread = createBarrier(ml);
    }
-    cv_.notify_all();
+    if (last_thread) phase_condition_.notify_all();
+    return last_thread;
  }

-private:
-  bool HasBeenNotified() const REQUIRES(mutex_) {
-    return notified_yet_;
+  void removeThread() EXCLUDES(lock_) {
+    MutexLock ml(lock_);
+    --running_threads_;
+    if (entered_ != 0) phase_condition_.notify_all();
  }

-  mutable Mutex mutex_;
-  mutable std::condition_variable cv_;
-  bool notified_yet_ GUARDED_BY(mutex_);
+ private:
+  Mutex lock_;
+  Condition phase_condition_;
+  int running_threads_;
+
+  // State for barrier management
+  int phase_number_ = 0;
+  int entered_ = 0;  // Number of threads that have entered this barrier
+
+  // Enter the barrier and wait until all other threads have also
+  // entered the barrier.  Returns iff this is the last thread to
+  // enter the barrier.
+  bool createBarrier(MutexLock& ml) REQUIRES(lock_) {
+    CHECK_LT(entered_, running_threads_);
+    entered_++;
+    if (entered_ < running_threads_) {
+      // Wait for all threads to enter
+      int phase_number_cp = phase_number_;
+      auto cb = [this, phase_number_cp]() {
+        return this->phase_number_ > phase_number_cp ||
+               entered_ == running_threads_;  // A thread has aborted in error
+      };
+      phase_condition_.wait(ml.native_handle(), cb);
+      if (phase_number_ > phase_number_cp) return false;
+      // else (running_threads_ == entered_) and we are the last thread.
+    }
+    // Last thread has reached the barrier
+    phase_number_++;
+    entered_ = 0;
+    return true;
+  }
 };

 } // end namespace benchmark
--- a/src/reporter.cc
+++ b/src/reporter.cc
@ -13,7 +13,7 @@
 // limitations under the License.

 #include "benchmark/reporter.h"
-#include "walltime.h"
+#include "timers.h"

 #include <cstdlib>

--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@ -52,7 +52,6 @@ namespace {
 std::once_flag cpuinfo_init;
 double cpuinfo_cycles_per_second = 1.0;
 int cpuinfo_num_cpus = 1;  // Conservative guess
-std::mutex cputimens_mutex;

 #if !defined BENCHMARK_OS_MACOSX
 const int64_t estimate_time_ms = 1000;
@ -288,102 +287,9 @@ void InitializeSystemInfo() {
  cpuinfo_cycles_per_second = static_cast<double>(EstimateCyclesPerSecond());
 #endif
 }
+
 }  // end namespace

-// getrusage() based implementation of MyCPUUsage
-static double MyCPUUsageRUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  struct rusage ru;
-  if (getrusage(RUSAGE_SELF, &ru) == 0) {
-    return (static_cast<double>(ru.ru_utime.tv_sec) +
-            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
-            static_cast<double>(ru.ru_stime.tv_sec) +
-            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
-  } else {
-    return 0.0;
-  }
-#else
-  HANDLE proc = GetCurrentProcess();
-  FILETIME creation_time;
-  FILETIME exit_time;
-  FILETIME kernel_time;
-  FILETIME user_time;
-  ULARGE_INTEGER kernel;
-  ULARGE_INTEGER user;
-  GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, &user_time);
-  kernel.HighPart = kernel_time.dwHighDateTime;
-  kernel.LowPart = kernel_time.dwLowDateTime;
-  user.HighPart = user_time.dwHighDateTime;
-  user.LowPart = user_time.dwLowDateTime;
-  return (static_cast<double>(kernel.QuadPart) +
-          static_cast<double>(user.QuadPart)) * 1e-7;
-#endif  // OS_WINDOWS
-}
-
-#ifndef BENCHMARK_OS_WINDOWS
-static bool MyCPUUsageCPUTimeNsLocked(double* cputime) {
-  static int cputime_fd = -1;
-  if (cputime_fd == -1) {
-    cputime_fd = open("/proc/self/cputime_ns", O_RDONLY);
-    if (cputime_fd < 0) {
-      cputime_fd = -1;
-      return false;
-    }
-  }
-  char buff[64];
-  memset(buff, 0, sizeof(buff));
-  if (pread(cputime_fd, buff, sizeof(buff) - 1, 0) <= 0) {
-    close(cputime_fd);
-    cputime_fd = -1;
-    return false;
-  }
-  unsigned long long result = strtoull(buff, nullptr, 0);
-  if (result == (std::numeric_limits<unsigned long long>::max)()) {
-    close(cputime_fd);
-    cputime_fd = -1;
-    return false;
-  }
-  *cputime = static_cast<double>(result) / 1e9;
-  return true;
-}
-#endif  // OS_WINDOWS
-
-double MyCPUUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  {
-    std::lock_guard<std::mutex> l(cputimens_mutex);
-    static bool use_cputime_ns = true;
-    if (use_cputime_ns) {
-      double value;
-      if (MyCPUUsageCPUTimeNsLocked(&value)) {
-        return value;
-      }
-      // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage().
-      VLOG(1) << "Reading /proc/self/cputime_ns failed. Using getrusage().\n";
-      use_cputime_ns = false;
-    }
-  }
-#endif  // OS_WINDOWS
-  return MyCPUUsageRUsage();
-}
-
-double ChildrenCPUUsage() {
-#ifndef BENCHMARK_OS_WINDOWS
-  struct rusage ru;
-  if (getrusage(RUSAGE_CHILDREN, &ru) == 0) {
-    return (static_cast<double>(ru.ru_utime.tv_sec) +
-            static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
-            static_cast<double>(ru.ru_stime.tv_sec) +
-            static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
-  } else {
-    return 0.0;
-  }
-#else
-  // TODO: Not sure what this even means on Windows
-  return 0.0;
-#endif  // OS_WINDOWS
-}
-
 double CyclesPerSecond(void) {
  std::call_once(cpuinfo_init, InitializeSystemInfo);
  return cpuinfo_cycles_per_second;
--- a/src/sysinfo.h
+++ b/src/sysinfo.h
@ -2,8 +2,6 @@
 #define BENCHMARK_SYSINFO_H_

 namespace benchmark {
-double MyCPUUsage();
-double ChildrenCPUUsage();
 int NumCPUs();
 double CyclesPerSecond();
 bool CpuScalingEnabled();
--- a/src/timers.cc
+++ b/src/timers.cc
@ -0,0 +1,195 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "timers.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <Shlwapi.h>
+#include <VersionHelpers.h>
+#include <Windows.h>
+#else
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#include <unistd.h>
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#include <sys/sysctl.h>
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#include <mach/thread_act.h>
+#endif
+#endif
+
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <limits>
+#include <mutex>
+
+#include "check.h"
+#include "log.h"
+#include "sleep.h"
+#include "string_util.h"
+
+namespace benchmark {
+
+// Suppress unused warnings on helper functions.
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+namespace {
+#if defined(BENCHMARK_OS_WINDOWS)
+double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) {
+  ULARGE_INTEGER kernel;
+  ULARGE_INTEGER user;
+  kernel.HighPart = kernel_time.dwHighDateTime;
+  kernel.LowPart = kernel_time.dwLowDateTime;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  return (static_cast<double>(kernel.QuadPart) +
+          static_cast<double>(user.QuadPart)) *
+         1e-7;
+}
+#else
+double MakeTime(struct timespec const& ts) {
+  return ts.tv_sec + (static_cast<double>(ts.tv_nsec) * 1e-9);
+}
+double MakeTime(struct rusage ru) {
+  return (static_cast<double>(ru.ru_utime.tv_sec) +
+          static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
+          static_cast<double>(ru.ru_stime.tv_sec) +
+          static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+}
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+double MakeTime(thread_basic_info_data_t const& info) {
+  return (static_cast<double>(info.user_time.seconds) +
+          static_cast<double>(info.user_time.microseconds) * 1e-6 +
+          static_cast<double>(info.system_time.seconds) +
+          static_cast<double>(info.user_time.microseconds) * 1e-6);
+}
+#endif
+
+BENCHMARK_NORETURN static void  DiagnoseAndExit(const char* msg) {
+    std::cerr << "ERROR: " << msg << std::endl;
+    std::exit(EXIT_FAILURE);
+}
+
+}  // end namespace
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
+double ProcessCPUUsage() {
+#if defined(BENCHMARK_OS_WINDOWS)
+  HANDLE proc = GetCurrentProcess();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  if (GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time, &user_time))
+    return MakeTime(kernel_time, user_time);
+  DiagnoseAndExit("GetProccessTimes() failed");
+#elif defined(CLOCK_PROCESS_CPUTIME_ID)
+  struct timespec spec;
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+    return MakeTime(spec);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+#else
+  struct rusage ru;
+  if (getrusage(RUSAGE_SELF, &ru) == 0)
+    return MakeTime(ru);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+#endif
+}
+
+double ThreadCPUUsage() {
+#if defined(BENCHMARK_OS_WINDOWS)
+  HANDLE this_thread = GetCurrentThread();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time,
+                 &user_time);
+  return MakeTime(kernel_time, user_time);
+#elif defined(CLOCK_THREAD_CPUTIME_ID)
+  struct timespec ts;
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
+    return MakeTime(ts);
+  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
+#elif defined(BENCHMARK_OS_MACOSX)
+  mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
+  thread_basic_info_data_t info;
+  mach_port_t thread = pthread_mach_thread_np(pthread_self());
+  if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t) &info, &count)
+      == KERN_SUCCESS) {
+    return MakeTime(info);
+  }
+  DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
+#else
+#error Per-thread timing is not available on your system.
+#endif
+}
+
+namespace {
+
+std::string DateTimeString(bool local) {
+  typedef std::chrono::system_clock Clock;
+  std::time_t now = Clock::to_time_t(Clock::now());
+  const std::size_t kStorageSize = 128;
+  char storage[kStorageSize];
+  std::size_t written;
+
+  if (local) {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written =
+        std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
+#else
+    std::tm timeinfo;
+    std::memset(&timeinfo, 0, sizeof(std::tm));
+    ::localtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  } else {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
+#else
+    std::tm timeinfo;
+    std::memset(&timeinfo, 0, sizeof(std::tm));
+    ::gmtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  }
+  CHECK(written < kStorageSize);
+  ((void)written);  // prevent unused variable in optimized mode.
+  return std::string(storage);
+}
+
+}  // end namespace
+
+std::string LocalDateTimeString() { return DateTimeString(true); }
+
+}  // end namespace benchmark
--- a/src/timers.h
+++ b/src/timers.h
@ -0,0 +1,48 @@
+#ifndef BENCHMARK_TIMERS_H
+#define BENCHMARK_TIMERS_H
+
+#include <chrono>
+#include <string>
+
+namespace benchmark {
+
+// Return the CPU usage of the current process
+double ProcessCPUUsage();
+
+// Return the CPU usage of the children of the current process
+double ChildrenCPUUsage();
+
+// Return the CPU usage of the current thread
+double ThreadCPUUsage();
+
+#if defined(HAVE_STEADY_CLOCK)
+template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
+struct ChooseSteadyClock {
+  typedef std::chrono::high_resolution_clock type;
+};
+
+template <>
+struct ChooseSteadyClock<false> {
+  typedef std::chrono::steady_clock type;
+};
+#endif
+
+struct ChooseClockType {
+#if defined(HAVE_STEADY_CLOCK)
+  typedef ChooseSteadyClock<>::type type;
+#else
+  typedef std::chrono::high_resolution_clock type;
+#endif
+};
+
+inline double ChronoClockNow() {
+  typedef ChooseClockType::type ClockType;
+  using FpSeconds = std::chrono::duration<double, std::chrono::seconds::period>;
+  return FpSeconds(ClockType::now().time_since_epoch()).count();
+}
+
+std::string LocalDateTimeString();
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_TIMERS_H
--- a/src/walltime.cc
+++ b/src/walltime.cc
@ -1,263 +0,0 @@
-// Copyright 2015 Google Inc. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "benchmark/macros.h"
-#include "internal_macros.h"
-#include "walltime.h"
-
-#if defined(BENCHMARK_OS_WINDOWS)
-#include <time.h>
-#include <winsock.h> // for timeval
-#else
-#include <sys/time.h>
-#endif
-
-#include <cstdio>
-#include <cstdint>
-#include <cstring>
-#include <ctime>
-
-#include <atomic>
-#include <chrono>
-#include <limits>
-
-#include "arraysize.h"
-#include "check.h"
-#include "cycleclock.h"
-#include "log.h"
-#include "sysinfo.h"
-
-namespace benchmark {
-namespace walltime {
-
-namespace {
-
-#if defined(HAVE_STEADY_CLOCK)
-template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
-struct ChooseSteadyClock {
-    typedef std::chrono::high_resolution_clock type;
-};
-
-template <>
-struct ChooseSteadyClock<false> {
-    typedef std::chrono::steady_clock type;
-};
-#endif
-
-struct ChooseClockType {
-#if defined(HAVE_STEADY_CLOCK)
-  typedef ChooseSteadyClock<>::type type;
-#else
-  typedef std::chrono::high_resolution_clock type;
-#endif
-};
-
-class WallTimeImp
-{
-public:
-  WallTime Now();
-
-  static WallTimeImp& GetWallTimeImp() {
-    static WallTimeImp* imp = new WallTimeImp();
-    return *imp;
-  }
-
-private:
-  WallTimeImp();
-  // Helper routines to load/store a float from an AtomicWord. Required because
-  // g++ < 4.7 doesn't support std::atomic<float> correctly. I cannot wait to
-  // get rid of this horror show.
-  void SetDrift(float f) {
-    int32_t w;
-    memcpy(&w, &f, sizeof(f));
-    std::atomic_store(&drift_adjust_, w);
-  }
-
-  float GetDrift() const {
-    float f;
-    int32_t w = std::atomic_load(&drift_adjust_);
-    memcpy(&f, &w, sizeof(f));
-    return f;
-  }
-
-  WallTime Slow() const {
-    struct timeval tv;
-#if defined(BENCHMARK_OS_WINDOWS)
-    FILETIME    file_time;
-    SYSTEMTIME  system_time;
-    ULARGE_INTEGER ularge;
-    const unsigned __int64 epoch = 116444736000000000LL;
-
-    GetSystemTime(&system_time);
-    SystemTimeToFileTime(&system_time, &file_time);
-    ularge.LowPart = file_time.dwLowDateTime;
-    ularge.HighPart = file_time.dwHighDateTime;
-
-    tv.tv_sec = (long)((ularge.QuadPart - epoch) / (10L * 1000 * 1000));
-    tv.tv_usec = (long)(system_time.wMilliseconds * 1000);
-#else
-    gettimeofday(&tv, nullptr);
-#endif
-    return tv.tv_sec + tv.tv_usec * 1e-6;
-  }
-
-private:
-  static_assert(sizeof(float) <= sizeof(int32_t),
-               "type sizes don't allow the drift_adjust hack");
-
-  WallTime base_walltime_;
-  int64_t base_cycletime_;
-  int64_t cycles_per_second_;
-  double seconds_per_cycle_;
-  uint32_t last_adjust_time_;
-  std::atomic<int32_t> drift_adjust_;
-  int64_t max_interval_cycles_;
-
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(WallTimeImp);
-};
-
-
-WallTime WallTimeImp::Now() {
-  WallTime now = 0.0;
-  WallTime result = 0.0;
-  int64_t ct = 0;
-  uint32_t top_bits = 0;
-  do {
-    ct = cycleclock::Now();
-    int64_t cycle_delta = ct - base_cycletime_;
-    result = base_walltime_ + cycle_delta * seconds_per_cycle_;
-
-    top_bits = static_cast<uint32_t>(uint64_t(ct) >> 32);
-    // Recompute drift no more often than every 2^32 cycles.
-    // I.e., @2GHz, ~ every two seconds
-    if (top_bits == last_adjust_time_) {  // don't need to recompute drift
-      return result + GetDrift();
-    }
-
-    now = Slow();
-  } while (cycleclock::Now() - ct > max_interval_cycles_);
-  // We are now sure that "now" and "result" were produced within
-  // kMaxErrorInterval of one another.
-
-  SetDrift(static_cast<float>(now - result));
-  last_adjust_time_ = top_bits;
-  return now;
-}
-
-
-WallTimeImp::WallTimeImp()
-    : base_walltime_(0.0), base_cycletime_(0),
-      cycles_per_second_(0), seconds_per_cycle_(0.0),
-      last_adjust_time_(0), drift_adjust_(0),
-      max_interval_cycles_(0) {
-  const double kMaxErrorInterval = 100e-6;
-  cycles_per_second_ = static_cast<int64_t>(CyclesPerSecond());
-  CHECK(cycles_per_second_ != 0);
-  seconds_per_cycle_ = 1.0 / cycles_per_second_;
-  max_interval_cycles_ =
-      static_cast<int64_t>(cycles_per_second_ * kMaxErrorInterval);
-  do {
-    base_cycletime_ = cycleclock::Now();
-    base_walltime_ = Slow();
-  } while (cycleclock::Now() - base_cycletime_ > max_interval_cycles_);
-  // We are now sure that "base_walltime" and "base_cycletime" were produced
-  // within kMaxErrorInterval of one another.
-
-  SetDrift(0.0);
-  last_adjust_time_ = static_cast<uint32_t>(uint64_t(base_cycletime_) >> 32);
-}
-
-WallTime CPUWalltimeNow() {
-  static WallTimeImp& imp = WallTimeImp::GetWallTimeImp();
-  return imp.Now();
-}
-
-WallTime ChronoWalltimeNow() {
-  typedef ChooseClockType::type Clock;
-  typedef std::chrono::duration<WallTime, std::chrono::seconds::period>
-          FPSeconds;
-  static_assert(std::chrono::treat_as_floating_point<WallTime>::value,
-                "This type must be treated as a floating point type.");
-  auto now = Clock::now().time_since_epoch();
-  return std::chrono::duration_cast<FPSeconds>(now).count();
-}
-
-bool UseCpuCycleClock() {
-    bool useWallTime = !CpuScalingEnabled();
-    if (useWallTime) {
-        VLOG(1) << "Using the CPU cycle clock to provide walltime::Now().\n";
-    } else {
-        VLOG(1) << "Using std::chrono to provide walltime::Now().\n";
-    }
-    return useWallTime;
-}
-
-
-} // end anonymous namespace
-
-// WallTimeImp doesn't work when CPU Scaling is enabled. If CPU Scaling is
-// enabled at the start of the program then std::chrono::system_clock is used
-// instead.
-WallTime Now()
-{
-  static bool useCPUClock = UseCpuCycleClock();
-  if (useCPUClock) {
-    return CPUWalltimeNow();
-  } else {
-    return ChronoWalltimeNow();
-  }
-}
-
-}  // end namespace walltime
-
-
-namespace {
-
-std::string DateTimeString(bool local) {
-  typedef std::chrono::system_clock Clock;
-  std::time_t now = Clock::to_time_t(Clock::now());
-  char storage[128];
-  std::size_t written;
-
-  if (local) {
-#if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
-#else
-    std::tm timeinfo;
-    std::memset(&timeinfo, 0, sizeof(std::tm));
-    ::localtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-#endif
-  } else {
-#if defined(BENCHMARK_OS_WINDOWS)
-    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
-#else
-    std::tm timeinfo;
-    std::memset(&timeinfo, 0, sizeof(std::tm));
-    ::gmtime_r(&now, &timeinfo);
-    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
-#endif
-  }
-  CHECK(written < arraysize(storage));
-  ((void)written); // prevent unused variable in optimized mode.
-  return std::string(storage);
-}
-
-} // end namespace
-
-std::string LocalDateTimeString() {
-  return DateTimeString(true);
-}
-
-}  // end namespace benchmark
--- a/src/walltime.h
+++ b/src/walltime.h
@ -1,17 +0,0 @@
-#ifndef BENCHMARK_WALLTIME_H_
-#define BENCHMARK_WALLTIME_H_
-
-#include <string>
-
-namespace benchmark {
-typedef double WallTime;
-
-namespace walltime {
-WallTime Now();
-}  // end namespace walltime
-
-std::string LocalDateTimeString();
-
-}  // end namespace benchmark
-
-#endif  // BENCHMARK_WALLTIME_H_
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@ -59,6 +59,10 @@ def calculate_change(old_val, new_val):
    """
    Return a float representing the decimal change between old_val and new_val.
    """
+    if old_val == 0 and new_val == 0:
+        return 0.0
+    if old_val == 0:
+        return float(new_val - old_val) / (float(old_val + new_val) / 2)
    return float(new_val - old_val) / abs(old_val)


@ -73,7 +77,7 @@ def generate_difference_report(json1, json2, use_color=True):
            if b['name'] == name:
                return b
        return None
-    first_line = "{:<{}s}     Time           CPU".format(
+    first_line = "{:<{}s}     Time           CPU           Old           New".format(
        'Benchmark', first_col_width)
    output_strs = [first_line, '-' * len(first_line)]
    for bn in json1['benchmarks']:
@ -88,12 +92,13 @@ def generate_difference_report(json1, json2, use_color=True):
                return BC_WHITE
            else:
                return BC_CYAN
-        fmt_str = "{}{:<{}s}{endc}    {}{:+.2f}{endc}         {}{:+.2f}{endc}"
+        fmt_str = "{}{:<{}s}{endc}    {}{:+.2f}{endc}         {}{:+.2f}{endc}         {:4d}         {:4d}"
        tres = calculate_change(bn['real_time'], other_bench['real_time'])
        cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
        output_strs += [color_format(use_color, fmt_str,
            BC_HEADER, bn['name'], first_col_width,
            get_color(tres), tres, get_color(cpures), cpures,
+            bn['cpu_time'], other_bench['cpu_time'],
            endc=BC_ENDC)]
    return output_strs