From 403f35442375f2ee858981b79421ca321645df08 Mon Sep 17 00:00:00 2001 From: Dominic Hamon Date: Wed, 18 Dec 2013 16:55:45 -0800 Subject: [PATCH] Initial commit Benchmark library builds and runs but only single-threaded. Multithreaded support needs a bit more love. Currently requires some C++11 support (g++ 4.6.3 seems to work). --- .gitignore | 6 + CMakeLists.txt | 43 ++ include/benchmark/benchmark.h | 540 +++++++++++++++ include/benchmark/macros.h | 120 ++++ src/benchmark.cc | 1197 +++++++++++++++++++++++++++++++++ src/colorprint.cc | 82 +++ src/colorprint.h | 17 + src/commandlineflags.cc | 213 ++++++ src/commandlineflags.h | 79 +++ src/cycleclock.h | 129 ++++ src/macros.h | 110 +++ src/mutex_lock.h | 20 + src/port.h | 8 + src/sleep.cc | 42 ++ src/sleep.h | 10 + src/stat.h | 306 +++++++++ src/sysinfo.cc | 337 ++++++++++ src/sysinfo.h | 9 + src/walltime.cc | 137 ++++ src/walltime.h | 19 + test/benchmark_test.cc | 138 ++++ 21 files changed, 3562 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 include/benchmark/benchmark.h create mode 100644 include/benchmark/macros.h create mode 100644 src/benchmark.cc create mode 100644 src/colorprint.cc create mode 100644 src/colorprint.h create mode 100644 src/commandlineflags.cc create mode 100644 src/commandlineflags.h create mode 100644 src/cycleclock.h create mode 100644 src/macros.h create mode 100644 src/mutex_lock.h create mode 100644 src/port.h create mode 100644 src/sleep.cc create mode 100644 src/sleep.h create mode 100644 src/stat.h create mode 100644 src/sysinfo.cc create mode 100644 src/sysinfo.h create mode 100644 src/walltime.cc create mode 100644 src/walltime.h create mode 100644 test/benchmark_test.cc diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..5dfc6384 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +CMakeCache.txt +CMakeFiles/ +Makefile +bin/ +cmake_install.cmake +lib/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..f363a476 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required (VERSION 2.8) +project (benchmark) + +find_package(Threads) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib) + +set(CMAKE_CXX_FLAGS "-Wall -Werror --std=c++0x") +set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-fno-strict-aliasing -O3 -DNDEBUG") + +# Set OS +if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + add_definitions(-DOS_MACOSX) +endif() + +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") + add_definitions(-DOS_LINUX) +endif() + +if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") + add_definitions(-DOS_WINDOWS) +endif() + +# Set CPU +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86") + add_definitions(-DARCH_X86) +endif() + +# Set up directories +include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(${PROJECT_SOURCE_DIR}/src) +link_directories(${PROJECT_SOURCE_DIR}/lib) + +# Build the targets +FILE(GLOB SOURCE_FILES "src/*.cc") +add_library(benchmark STATIC ${SOURCE_FILES}) + +add_executable(benchmark_test test/benchmark_test.cc) +target_link_libraries(benchmark_test benchmark ${CMAKE_THREAD_LIBS_INIT}) + diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h new file mode 100644 index 00000000..d5dec23c --- /dev/null +++ b/include/benchmark/benchmark.h @@ -0,0 +1,540 @@ +// Support for registering benchmarks for functions. 
+
+/* Example usage:
+// Define a function that executes the code to be measured a
+// specified number of times:
+static void BM_StringCreation(benchmark::State& state) {
+  while (state.KeepRunning())
+    std::string empty_string;
+}
+
+// Register the function as a benchmark
+BENCHMARK(BM_StringCreation);
+
+// Define another benchmark
+static void BM_StringCopy(benchmark::State& state) {
+  std::string x = "hello";
+  while (state.KeepRunning())
+    std::string copy(x);
+}
+BENCHMARK(BM_StringCopy);
+
+// Augment the main() program to invoke benchmarks if specified
+// via the --benchmark_filter command line flag. E.g.,
+//   my_unittest --benchmark_filter=all
+//   my_unittest --benchmark_filter=BM_StringCreation
+//   my_unittest --benchmark_filter=String
+//   my_unittest --benchmark_filter='Copy|Creation'
+int main(int argc, char** argv) {
+  benchmark::Initialize(&argc, argv);
+
+  benchmark::RunSpecifiedBenchmarks();
+}
+
+// Sometimes a family of microbenchmarks can be implemented with
+// just one routine that takes an extra argument to specify which
+// one of the family of benchmarks to run. For example, the following
+// code defines a family of microbenchmarks for measuring the speed
+// of memcpy() calls of different lengths:
+
+static void BM_memcpy(benchmark::State& state) {
+  char* src = new char[state.range_x()];
+  char* dst = new char[state.range_x()];
+  memset(src, 'x', state.range_x());
+  while (state.KeepRunning())
+    memcpy(dst, src, state.range_x());
+  state.SetBytesProcessed(int64_t(state.iterations()) *
+                          int64_t(state.range_x()));
+  delete[] src;
+  delete[] dst;
+}
+BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10);
+
+// The preceding code is quite repetitive, and can be replaced with the
+// following short-hand. The following invocation will pick a few
+// appropriate arguments in the specified range and will generate a
+// microbenchmark for each such argument.
+BENCHMARK(BM_memcpy)->Range(8, 8<<10);
+
+// You might have a microbenchmark that depends on two inputs. For
+// example, the following code defines a family of microbenchmarks for
+// measuring the speed of set insertion.
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::set<int> data = ConstructRandomSet(state.range_x());
+    state.ResumeTiming();
+    for (int j = 0; j < state.range_y(); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert)
+    ->ArgPair(1<<10, 1)
+    ->ArgPair(1<<10, 8)
+    ->ArgPair(1<<10, 64)
+    ->ArgPair(1<<10, 512)
+    ->ArgPair(8<<10, 1)
+    ->ArgPair(8<<10, 8)
+    ->ArgPair(8<<10, 64)
+    ->ArgPair(8<<10, 512);
+
+// The preceding code is quite repetitive, and can be replaced with
+// the following short-hand. The following macro will pick a few
+// appropriate arguments in the product of the two specified ranges
+// and will generate a microbenchmark for each such pair.
+BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 512);
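+
+// Range() spaces its arguments out exponentially. When every value in a
+// small interval matters, DenseRange instead runs the benchmark once for
+// every value in [start..limit]. For example (a sketch, reusing the
+// BM_memcpy routine above):
+BENCHMARK(BM_memcpy)->DenseRange(1, 8);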
+
+// For more complex patterns of inputs, passing a custom function
+// to Apply allows programmatic specification of an
+// arbitrary set of arguments to run the microbenchmark on.
+// The following example enumerates a dense range on
+// one parameter, and a sparse range on the second.
+static benchmark::internal::Benchmark* CustomArguments(
+    benchmark::internal::Benchmark* b) {
+  for (int i = 0; i <= 10; ++i)
+    for (int j = 32; j <= 1024*1024; j *= 8)
+      b = b->ArgPair(i, j);
+  return b;
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+
+// Templated microbenchmarks work the same way:
+// Produce then consume 'size' messages 'iters' times
+// Measures throughput in the absence of multiprogramming.
+template <class Q> int BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  while (state.KeepRunning()) {
+    for (int i = state.range_x(); i--; )
+      q.push(v);
+    for (int e = state.range_x(); e--; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(
+      static_cast<int64_t>(state.iterations())*state.range_x());
+}
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+In a multithreaded test, it is guaranteed that none of the threads will start
+until all have called KeepRunning, and all will have finished before KeepRunning
+returns false. As such, any global setup or teardown you want to do can be
+wrapped in a check against the thread index:
+
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  while (state.KeepRunning()) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(2);
+*/
+
+#ifndef BENCHMARK_BENCHMARK_H_
+#define BENCHMARK_BENCHMARK_H_
+
+#include <stdint.h>
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "macros.h"
+
+namespace benchmark {
+// If the --benchmark_filter flag is empty, do nothing.
+//
+// Otherwise, run all benchmarks specified by the --benchmark_filter flag,
+// and exit after running the benchmarks.
+extern void RunSpecifiedBenchmarks();
+
+// ------------------------------------------------------
+// Routines that can be called from within a benchmark
+
+// Sets a label for the currently executing benchmark run (see also
+// State::SetLabel below).
+// REQUIRES: a benchmark is currently executing
+extern void SetLabel(const std::string& label);
+
+// If this routine is called, peak memory allocation past this point in the
+// benchmark is reported at the end of the benchmark report line. (It is
+// computed by running the benchmark once with a single iteration and a memory
+// tracer.)
+extern void MemoryUsage();
+
+// If a particular benchmark is I/O bound, or if for some reason CPU
+// timings are not representative, call this method from within the
+// benchmark routine. If called, the elapsed time will be used to
+// control how many iterations are run, and in the printing of
+// items/second or MB/second values. If not called, the cpu time
+// used by the benchmark will be used.
+extern void UseRealTime();
+
+namespace internal {
+class Benchmark;
+}
+
+// State is passed to a running Benchmark and contains state for the
+// benchmark to use.
+class State {
+ public:
+  // Returns true iff the benchmark should continue through another iteration.
+  bool KeepRunning();
+
+  void PauseTiming();
+  void ResumeTiming();
+
+  // Set the number of bytes processed by the current benchmark
+  // execution. This routine is typically called once at the end of a
+  // throughput oriented benchmark. If this routine is called with a
+  // value > 0, the report is printed in MB/sec instead of nanoseconds
+  // per iteration.
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  void SetBytesProcessed(int64_t bytes);
+
+  // If this routine is called with items > 0, then an items/s
+  // label is printed on the benchmark report line for the currently
+  // executing benchmark. It is typically called at the end of a processing
+  // benchmark where a processing items/second output is desired.
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  void SetItemsProcessed(int64_t items);
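+
+  // For example, the BM_Sequential sketch in the usage comment above
+  // transfers range_x() messages per iteration, so it could report
+  // throughput in messages rather than bytes with:
+  //   state.SetItemsProcessed(int64_t(state.iterations()) *
+  //                           int64_t(state.range_x()));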
+
+  // If this routine is called, the specified label is printed at the
+  // end of the benchmark report line for the currently executing
+  // benchmark. Example:
+  //  static void BM_Compress(benchmark::State& state) {
+  //    ...
+  //    double compression = input_size / output_size;
+  //    state.SetLabel(StringPrintf("compress:%.1f%%", 100.0*compression));
+  //  }
+  // Produces output that looks like:
+  //  BM_Compress   50         50   14115038  compress:27.3%
+  //
+  // REQUIRES: a benchmark has exited its KeepRunning loop.
+  void SetLabel(const std::string& label);
+
+  // Range arguments for this run. CHECKs if the argument has been set.
+  int range_x() const;
+  int range_y() const;
+
+  int iterations() const { return total_iterations_; }
+
+  const int thread_index;
+
+ private:
+  class FastClock;
+  struct SharedState;
+
+  State(FastClock* clock, SharedState* s, int t);
+  bool StartRunning();
+  bool FinishInterval();
+  bool MaybeStop();
+  void NewInterval();
+  bool AllStarting();
+  bool RunAnotherInterval() const;
+
+  void Run();
+
+  enum EState {
+    STATE_INITIAL,   // KeepRunning hasn't been called
+    STATE_STARTING,  // KeepRunning called, waiting for other threads
+    STATE_RUNNING,   // Running and being timed
+    STATE_STOPPING,  // Not being timed but waiting for other threads
+    STATE_STOPPED,   // Stopped
+  } state_;
+
+  FastClock* clock_;
+
+  // State shared by all BenchmarkRun objects that belong to the same
+  // BenchmarkInstance
+  SharedState* shared_;
+
+  // Custom label set by the user.
+  std::string label_;
+
+  // Each State object goes through a sequence of measurement intervals. By
+  // default each interval is approx. 100ms in length. The following stats are
+  // kept for each interval.
+  int64_t iterations_;
+  double start_cpu_;
+  double start_time_;
+  int64_t stop_time_micros_;
+
+  double start_pause_;
+  double pause_time_;
+
+  // Total number of iterations for all finished runs.
+  int64_t total_iterations_;
+
+  // Approximate time in microseconds for one interval of execution.
+  // Dynamically adjusted as needed.
+  int64_t interval_micros_;
+
+  // True if the current interval is the continuation of a previous one.
+  bool is_continuation_;
+
+  friend class internal::Benchmark;
+  DISALLOW_COPY_AND_ASSIGN(State);
+};
+
+namespace internal {
+class BenchmarkReporter;
+
+typedef std::function<void(State&)> BenchmarkFunction;
+
+// Run all benchmarks whose name is a partial match for the regular
+// expression in "spec". The results of benchmark runs are fed to "reporter".
+void RunMatchingBenchmarks(const std::string& spec,
+                           BenchmarkReporter* reporter);
+
+// Extract the list of benchmark names that match the specified regular
+// expression.
+void FindMatchingBenchmarkNames(const std::string& re,
+                                std::vector<std::string>* benchmark_names);
+
+// ------------------------------------------------------
+// Benchmark registration object. The BENCHMARK() macro expands
+// into an internal::Benchmark* object. Various methods can
+// be called on this object to change the properties of the benchmark.
+// Each method returns "this" so that multiple method calls can be
+// chained into one expression.
+class Benchmark {
+ public:
+  // The Benchmark takes ownership of the Callback pointed to by f.
+  Benchmark(const char* name, BenchmarkFunction f);
+
+  ~Benchmark();
+
+  // Note: the following methods all return "this" so that multiple
+  // method calls can be chained together in one expression.
+
+  // Run this benchmark once with "x" as the extra argument passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Arg(int x);
+
+  // Run this benchmark once for a number of values picked from the
+  // range [start..limit]. (start and limit are always picked.)
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Range(int start, int limit);
+
+  // Run this benchmark once for every value in the range [start..limit]
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* DenseRange(int start, int limit);
+
+  // Run this benchmark once with "x,y" as the extra arguments passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
+  Benchmark* ArgPair(int x, int y);
+
+  // Pick a set of values A from the range [lo1..hi1] and a set
+  // of values B from the range [lo2..hi2]. Run the benchmark for
+  // every pair of values in the cartesian product of A and B
+  // (i.e., for all combinations of the values in A and B).
+  // REQUIRES: The function passed to the constructor must accept arg1,arg2.
+  Benchmark* RangePair(int lo1, int hi1, int lo2, int hi2);
+
+  // Pass this benchmark object to *func, which can customize
+  // the benchmark by calling various methods like Arg, ArgPair,
+  // Threads, etc.
+  Benchmark* Apply(void (*func)(Benchmark* benchmark));
+
+  // Support for running multiple copies of the same benchmark concurrently
+  // in multiple threads. This may be useful when measuring the scaling
+  // of some piece of code.
+
+  // Run one instance of this benchmark concurrently in t threads.
+  Benchmark* Threads(int t);
+
+  // Pick a set of values T from [min_threads,max_threads].
+  // min_threads and max_threads are always included in T. Run this
+  // benchmark once for each value in T. The benchmark run for a
+  // particular value t consists of t threads running the benchmark
+  // function concurrently. For example, consider:
+  //    BENCHMARK(Foo)->ThreadRange(1,16);
+  // This will run the following benchmarks:
+  //    Foo in 1 thread
+  //    Foo in 2 threads
+  //    Foo in 4 threads
+  //    Foo in 8 threads
+  //    Foo in 16 threads
+  Benchmark* ThreadRange(int min_threads, int max_threads);
+
+  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
+  Benchmark* ThreadPerCpu();
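+
+  // As an illustration (a sketch, using the BM_MultiThreaded routine from
+  // the usage comment above):
+  //   BENCHMARK(BM_MultiThreaded)->Threads(4);      // exactly 4 threads
+  //   BENCHMARK(BM_MultiThreaded)->ThreadPerCpu();  // one thread per core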
+
+  // TODO(dominich): Control whether or not real-time is used for this benchmark
+  // TODO(dominich): Control the default number of iterations
+
+  // -------------------------------
+  // Following methods are not useful for clients
+
+  // Used inside the benchmark implementation
+  struct Instance;
+  struct ThreadStats;
+
+  // Extract the list of benchmark instances that match the specified
+  // regular expression.
+  static void FindBenchmarks(const std::string& re,
+                             std::vector<Instance>* benchmarks);
+
+  // Measure the overhead of an empty benchmark to subtract later.
+  static void MeasureOverhead();
+
+ private:
+  std::vector<Instance> CreateBenchmarkInstances(int rangeXindex,
+                                                 int rangeYindex);
+
+  std::string name_;
+  BenchmarkFunction function_;
+  int registration_index_;
+  std::vector<int> rangeX_;
+  std::vector<int> rangeY_;
+  std::vector<int> thread_counts_;
+
+  // Special value placed in thread_counts_ to stand for NumCPUs()
+  static const int kNumCpuMarker = -1;
+
+  // Special value used to indicate that no range is required.
+  static const int kNoRange = -1;
+
+  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);
+  static double MeasurePeakHeapMemory(const Instance& b);
+  static void RunInstance(const Instance& b, BenchmarkReporter* br);
+  friend class ::benchmark::State;
+  friend struct ::benchmark::internal::Benchmark::Instance;
+  friend void ::benchmark::internal::RunMatchingBenchmarks(
+      const std::string&, BenchmarkReporter*);
+  DISALLOW_COPY_AND_ASSIGN(Benchmark);
+};
+
+// ------------------------------------------------------
+// Benchmarks reporter interface + data containers.
+
+struct BenchmarkContextData {
+  int num_cpus;
+  double mhz_per_cpu;
+  //std::string cpu_info;
+  bool cpu_scaling_enabled;
+
+  // The number of chars in the longest benchmark name.
+  int name_field_width;
+};
+
+struct BenchmarkRunData {
+  BenchmarkRunData() :
+      thread_index(-1),
+      iterations(1),
+      real_accumulated_time(0),
+      cpu_accumulated_time(0),
+      bytes_per_second(0),
+      items_per_second(0),
+      max_heapbytes_used(0) {}
+
+  std::string benchmark_name;
+  std::string report_label;
+  int thread_index;
+  int64_t iterations;
+  double real_accumulated_time;
+  double cpu_accumulated_time;
+
+  // Zero if not set by benchmark.
+  double bytes_per_second;
+  double items_per_second;
+
+  // This is set to 0.0 if memory tracing is not enabled.
+  double max_heapbytes_used;
+};
+
+// Interface for custom benchmark result printers.
+// By default, benchmark reports are printed to stdout. However an application
+// can control the destination of the reports by calling
+// RunMatchingBenchmarks and passing it a custom reporter object.
+// The reporter object must implement the following interface.
+class BenchmarkReporter {
+ public:
+  // Called once for every suite of benchmarks run.
+  // The parameter "context" contains information that the
+  // reporter may wish to use when generating its report, for example the
+  // platform under which the benchmarks are running. The benchmark run is
+  // never started if this function returns false, allowing the reporter
+  // to skip runs based on the context information.
+  virtual bool ReportContext(const BenchmarkContextData& context) = 0;
+
+  // Called once for each group of benchmark runs, gives information about
+  // cpu-time and heap memory usage during the benchmark run.
+  // Note that all the grouped benchmark runs should refer to the same
+  // benchmark, thus have the same name.
+  virtual void ReportRuns(const std::vector<BenchmarkRunData>& report) = 0;
+
+  virtual ~BenchmarkReporter();
+};
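+
+// For instance, a reporter that silently discards all output is a minimal
+// sketch of this interface:
+//   class NullReporter : public BenchmarkReporter {
+//    public:
+//     virtual bool ReportContext(const BenchmarkContextData&) { return true; }
+//     virtual void ReportRuns(const std::vector<BenchmarkRunData>&) {}
+//   };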
+
+
+// ------------------------------------------------------
+// Internal implementation details follow; please ignore
+
+// Given a collection of reports, computes their mean and stddev.
+// REQUIRES: all runs in "reports" must be from the same benchmark.
+void ComputeStats(const std::vector<BenchmarkRunData>& reports,
+                  BenchmarkRunData* mean_data,
+                  BenchmarkRunData* stddev_data);
+
+// Simple reporter that outputs benchmark data to the console. This is the
+// default reporter used by RunSpecifiedBenchmarks().
+class ConsoleReporter : public BenchmarkReporter {
+ public:
+  virtual bool ReportContext(const BenchmarkContextData& context);
+  virtual void ReportRuns(const std::vector<BenchmarkRunData>& reports);
+ private:
+  std::string PrintMemoryUsage(double bytes);
+  virtual void PrintRunData(const BenchmarkRunData& report);
+  int name_field_width_;
+};
+
+}  // end namespace internal
+
+void Initialize(int* argc, const char** argv);
+}  // end namespace benchmark
+
+// ------------------------------------------------------
+// Macro to register benchmarks
+
+// Helpers for generating unique variable names
+#define BENCHMARK_CONCAT(a, b, c) BENCHMARK_CONCAT2(a, b, c)
+#define BENCHMARK_CONCAT2(a, b, c) a ## b ## c
+
+#define BENCHMARK(n) \
+  static ::benchmark::internal::Benchmark* \
+  BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \
+      (new ::benchmark::internal::Benchmark(#n, n))
+
+// Old-style macros
+#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
+#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->ArgPair((a1), (a2))
+#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
+#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
+  BENCHMARK(n)->RangePair((l1), (h1), (l2), (h2))
+
+// This will register a benchmark for a templatized function. For example:
+//
+// template <int arg>
+// void BM_Foo(int iters);
+//
+// BENCHMARK_TEMPLATE(BM_Foo, 1);
+//
+// will register BM_Foo<1> as a benchmark.
+#define BENCHMARK_TEMPLATE(n, a) \
+  static ::benchmark::internal::Benchmark* \
+  BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \
+      (new ::benchmark::internal::Benchmark(#n "<" #a ">", n))
+
+#define BENCHMARK_TEMPLATE2(n, a, b) \
+  static ::benchmark::internal::Benchmark* \
+  BENCHMARK_CONCAT(__benchmark_, n, __LINE__) ATTRIBUTE_UNUSED = \
+      (new ::benchmark::internal::Benchmark(#n "<" #a "," #b ">", n))
+
+#endif  // BENCHMARK_BENCHMARK_H_
+
diff --git a/include/benchmark/macros.h b/include/benchmark/macros.h
new file mode 100644
index 00000000..8c2df946
--- /dev/null
+++ b/include/benchmark/macros.h
@@ -0,0 +1,120 @@
+#ifndef BENCHMARK_MACROS_H_
+#define BENCHMARK_MACROS_H_
+
+#include <assert.h>
+
+#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&); \
+  void operator=(const TypeName&);
+
+// The arraysize(arr) macro returns the # of elements in an array arr.
+// The expression is a compile-time constant, and therefore can be
+// used in defining new arrays, for example. If you use arraysize on
+// a pointer by mistake, you will get a compile-time error.
+//
+// One caveat is that, for C++03, arraysize() doesn't accept any array of
+// an anonymous type or a type defined inside a function. In these rare
+// cases, you have to use the unsafe ARRAYSIZE() macro below. This is
+// due to a limitation in C++03's template system. The limitation has
+// been removed in C++11.
+
+// This template function declaration is used in defining arraysize.
+// Note that the function doesn't need an implementation, as we only
+// use its type.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+
+// That gcc wants both of these prototypes seems mysterious. VC, for
+// its part, can't decide which to use (another mystery). Matching of
+// template overloads: the final frontier.
+#ifndef COMPILER_MSVC
+template <typename T, size_t N>
+char (&ArraySizeHelper(const T (&array)[N]))[N];
+#endif
+
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
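+
+// For example (a sketch):
+//   int primes[] = {2, 3, 5, 7, 11};
+//   double inverses[arraysize(primes)];  // arraysize(primes) == 5 is a
+//                                        // compile-time constant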
+
+// The STATIC_ASSERT macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   STATIC_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
+//                 content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   STATIC_ASSERT(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+template <bool>
+struct StaticAssert {
+};
+
+#define STATIC_ASSERT(expr, msg) \
+  typedef StaticAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// Implementation details of STATIC_ASSERT:
+//
+// - STATIC_ASSERT works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//     #define STATIC_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard). As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     STATIC_ASSERT(foo, msg);  // not supposed to compile as foo is
+//                               // not a compile-time constant.
+//
+// - By using the type StaticAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant. (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in StaticAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
+//
+//     StaticAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     STATIC_ASSERT(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+#define CHECK(b) do { if (!(b)) assert(false); } while(0)
+#define CHECK_EQ(a, b) CHECK((a) == (b))
+#define CHECK_GE(a, b) CHECK((a) >= (b))
+#define CHECK_LE(a, b) CHECK((a) <= (b))
+#define CHECK_GT(a, b) CHECK((a) > (b))
+#define CHECK_LT(a, b) CHECK((a) < (b))
+
+//
+// Prevent the compiler from complaining about or optimizing away variables
+// that appear unused.
+#define ATTRIBUTE_UNUSED __attribute__ ((unused))
+
+//
+// For functions we want to force inline or not inline.
+// Introduced in gcc 3.1.
+#define ATTRIBUTE_ALWAYS_INLINE __attribute__ ((always_inline))
+#define HAVE_ATTRIBUTE_ALWAYS_INLINE 1
+#define ATTRIBUTE_NOINLINE __attribute__ ((noinline))
+#define HAVE_ATTRIBUTE_NOINLINE 1
+
+#endif  // BENCHMARK_MACROS_H_
diff --git a/src/benchmark.cc b/src/benchmark.cc
new file mode 100644
index 00000000..7a4de8e3
--- /dev/null
+++ b/src/benchmark.cc
@@ -0,0 +1,1197 @@
+#include "benchmark/benchmark.h"
+#include "benchmark/macros.h"
+#include "colorprint.h"
+#include "commandlineflags.h"
+#include "mutex_lock.h"
+#include "sleep.h"
+#include "stat.h"
+#include "sysinfo.h"
+#include "walltime.h"
+
+#include <pthread.h>
+#include <semaphore.h>
+#include <string.h>
+
+#if defined OS_FREEBSD
+#include <gnuregex.h>
+#else
+#include <regex.h>
+#endif
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <sstream>
+
+DEFINE_string(benchmark_filter, ".",
+              "A regular expression that specifies the set of benchmarks "
+              "to execute. If this flag is empty, no benchmarks are run. 
" + "If this flag is the string \"all\", all benchmarks linked " + "into the process are run."); + +DEFINE_int32(benchmark_min_iters, 100, + "Minimum number of iterations per benchmark"); + +DEFINE_int32(benchmark_max_iters, 1000000000, + "Maximum number of iterations per benchmark"); + +DEFINE_double(benchmark_min_time, 0.5, + "Minimum number of seconds we should run benchmark before " + "results are considered significant. For cpu-time based " + "tests, this is the lower bound on the total cpu time " + "used by all threads that make up the test. For real-time " + "based tests, this is the lower bound on the elapsed time " + "of the benchmark execution, regardless of number of " + "threads."); + +DEFINE_bool(benchmark_memory_usage, false, + "Report memory usage for all benchmarks"); + +DEFINE_int32(benchmark_repetitions, 1, + "The number of runs of each benchmark. If greater than 1, the " + "mean and standard deviation of the runs will be reported."); + +DEFINE_int32(v, 0, "The level of verbose logging to output"); +DEFINE_bool(color_print, true, "Enables colorized logging."); + +// Will be non-empty if heap checking is turned on, which would +// invalidate any benchmarks. +DECLARE_string(heap_check); + +// The ""'s catch people who don't pass in a literal for "str" +#define strliterallen(str) (sizeof("" str "")-1) + +// Must use a string literal for prefix. +#define memprefix(str, len, prefix) \ + ( (((len) >= strliterallen(prefix)) \ + && memcmp(str, prefix, strliterallen(prefix)) == 0) \ + ? str + strliterallen(prefix) \ + : NULL ) + +namespace benchmark { + +namespace { + +// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta. +static const char kBigSIUnits[] = "kMGTPEZY"; +// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi. +static const char kBigIECUnits[] = "KMGTPEZY"; +// milli, micro, nano, pico, femto, atto, zepto, yocto. +static const char kSmallSIUnits[] = "munpfazy"; + +// We require that all three arrays have the same size. +STATIC_ASSERT(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), + SI_and_IEC_unit_arrays_must_be_the_same_size); +STATIC_ASSERT(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), + Small_SI_and_Big_SI_unit_arrays_must_be_the_same_size); +static const int kUnitsSize = arraysize(kBigSIUnits); + +void ToExponentAndMantissa(double val, double thresh, + int precision, double one_k, + std::string* mantissa, int* exponent) { + std::stringstream mantissa_stream; + + if (val < 0) { + mantissa_stream << "-"; + val = -val; + } + + // Adjust threshold so that it never excludes things which can't be rendered + // in 'precision' digits. 
+  const double adjusted_threshold =
+      std::max(thresh, 1.0 / pow(10.0, precision));
+  const double big_threshold = adjusted_threshold * one_k;
+  const double small_threshold = adjusted_threshold;
+
+  if (val > big_threshold) {
+    // Positive powers
+    double scaled = val;
+    for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) {
+      scaled /= one_k;
+      if (scaled <= big_threshold) {
+        mantissa_stream << scaled;
+        *exponent = i + 1;
+        *mantissa = mantissa_stream.str();
+        return;
+      }
+    }
+    mantissa_stream << val;
+    *exponent = 0;
+  } else if (val < small_threshold) {
+    // Negative powers
+    double scaled = val;
+    for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) {
+      scaled *= one_k;
+      if (scaled >= small_threshold) {
+        mantissa_stream << scaled;
+        *exponent = -i - 1;
+        *mantissa = mantissa_stream.str();
+        return;
+      }
+    }
+    mantissa_stream << val;
+    *exponent = 0;
+  } else {
+    mantissa_stream << val;
+    *exponent = 0;
+  }
+  *mantissa = mantissa_stream.str();
+}
+
+std::string ExponentToPrefix(int exponent, bool iec) {
+  if (exponent == 0)
+    return "";
+
+  const int index = (exponent > 0 ? exponent - 1 : -exponent - 1);
+  if (index >= kUnitsSize)
+    return "";
+
+  const char* array = (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) :
+                       kSmallSIUnits);
+  if (iec)
+    return array[index] + std::string("i");
+  else
+    return std::string(1, array[index]);
+}
+
+std::string ToBinaryStringFullySpecified(double value, double threshold,
+                                         int precision) {
+  std::string mantissa;
+  int exponent;
+  ToExponentAndMantissa(value, threshold, precision, 1024., &mantissa,
+                        &exponent);
+  return mantissa + ExponentToPrefix(exponent, false);
+}
+
+inline void AppendHumanReadable(int n, std::string* str) {
+  std::stringstream ss;
+  // Round down to the nearest SI prefix.
+  ss << "/" << ToBinaryStringFullySpecified(n, 1.0, 0);
+  *str += ss.str();
+}
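+
+// For example, AppendHumanReadable(65536, &name) appends "/64k" to a
+// benchmark name; this is how Arg()/Range() values show up in reports.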
+
+inline std::string HumanReadableNumber(double n) {
+  // 1.1 means that figures up to 1.1k should be shown with the next unit down;
+  // this softens edge effects.
+  // 1 means that we should show one decimal place of precision.
+  return ToBinaryStringFullySpecified(n, 1.1, 1);
+}
+
+}  // end namespace
+
+namespace internal {
+struct Benchmark::ThreadStats {
+  int64_t bytes_processed;
+  int64_t items_processed;
+
+  ThreadStats() { Reset(); }
+
+  void Reset() {
+    bytes_processed = 0;
+    items_processed = 0;
+  }
+
+  void Add(const ThreadStats& other) {
+    bytes_processed += other.bytes_processed;
+    items_processed += other.items_processed;
+  }
+};
+
+}  // end namespace internal
+
+namespace {
+
+// Per-thread stats
+pthread_key_t thread_stats_key;
+internal::Benchmark::ThreadStats* thread_stats = nullptr;
+
+// For non-dense Range, intermediate values are powers of kRangeMultiplier.
+static const int kRangeMultiplier = 8;
+
+// List of all registered benchmarks. Note that each registered
+// benchmark identifies a family of related benchmarks to run.
+static pthread_mutex_t benchmark_mutex;
+static std::vector<internal::Benchmark*>* families = NULL;
+
+bool running_benchmark = false;
+
+// Should this benchmark report memory usage?
+bool get_memory_usage;
+
+// Should this benchmark base decisions off of real time rather than
+// cpu time?
+bool use_real_time;
+
+// Overhead of an empty benchmark.
+double overhead = 0.0;
+
+void DeleteThreadStats(void* p) {
+  delete (internal::Benchmark::ThreadStats*) p;
+}
+
+// Return prefix to print in front of each reported line
+const char* Prefix() {
+#ifdef NDEBUG
+  return "";
+#else
+  return "DEBUG: ";
+#endif
+}
+
+// TODO
+//static internal::MallocCounter *benchmark_mc;
+
+static bool CpuScalingEnabled() {
+  // On Linux, the CPUfreq subsystem exposes CPU information as files on the
+  // local file system. If reading the exported files fails, then we may not be
+  // running on Linux, so we silently ignore all the read errors.
+  for (int cpu = 0, num_cpus = NumCPUs(); cpu < num_cpus; ++cpu) {
+    std::stringstream ss;
+    ss << "/sys/devices/system/cpu/cpu" << cpu << "/cpufreq/scaling_governor";
+    std::string governor_file = ss.str();
+    FILE* file = fopen(governor_file.c_str(), "r");
+    if (!file)
+      break;
+    char buff[16];
+    size_t bytes_read = fread(buff, 1, sizeof(buff), file);
+    fclose(file);
+    if (memprefix(buff, bytes_read, "performance") == NULL)
+      return true;
+  }
+  return false;
+}
+
+}  // namespace
+
+namespace internal {
+
+BenchmarkReporter::~BenchmarkReporter() {}
+
+void ComputeStats(const std::vector<BenchmarkRunData>& reports,
+                  BenchmarkRunData* mean_data,
+                  BenchmarkRunData* stddev_data) {
+  // Accumulators.
+  Stat1_d real_accumulated_time_stat;
+  Stat1_d cpu_accumulated_time_stat;
+  Stat1_d bytes_per_second_stat;
+  Stat1_d items_per_second_stat;
+  Stat1MinMax_d max_heapbytes_used_stat;
+  int total_iters = 0;
+
+  // Populate the accumulators.
+  for (std::vector<BenchmarkRunData>::const_iterator it = reports.begin();
+       it != reports.end(); ++it) {
+    CHECK_EQ(reports[0].benchmark_name, it->benchmark_name);
+    total_iters += it->iterations;
+    real_accumulated_time_stat +=
+        Stat1_d(it->real_accumulated_time/it->iterations, it->iterations);
+    cpu_accumulated_time_stat +=
+        Stat1_d(it->cpu_accumulated_time/it->iterations, it->iterations);
+    items_per_second_stat += Stat1_d(it->items_per_second, it->iterations);
+    bytes_per_second_stat += Stat1_d(it->bytes_per_second, it->iterations);
+    max_heapbytes_used_stat += Stat1MinMax_d(it->max_heapbytes_used,
+                                             it->iterations);
+  }
+
+  // Get the data from the accumulator to BenchmarkRunData's.
+  mean_data->benchmark_name = reports[0].benchmark_name + "_mean";
+  mean_data->iterations = total_iters;
+  mean_data->real_accumulated_time = real_accumulated_time_stat.Sum();
+  mean_data->cpu_accumulated_time = cpu_accumulated_time_stat.Sum();
+  mean_data->bytes_per_second = bytes_per_second_stat.Mean();
+  mean_data->items_per_second = items_per_second_stat.Mean();
+  mean_data->max_heapbytes_used = max_heapbytes_used_stat.Max();
+
+  // Only add label to mean/stddev if it is same for all runs
+  mean_data->report_label = reports[0].report_label;
+  for (size_t i = 1; i < reports.size(); i++) {
+    if (reports[i].report_label != reports[0].report_label) {
+      mean_data->report_label = "";
+      break;
+    }
+  }
+
+  stddev_data->benchmark_name = reports[0].benchmark_name + "_stddev";
+  stddev_data->report_label = mean_data->report_label;
+  stddev_data->iterations = total_iters;
+  // We multiply by total_iters since PrintRunData expects a total time.
+  stddev_data->real_accumulated_time =
+      real_accumulated_time_stat.StdDev() * total_iters;
+  stddev_data->cpu_accumulated_time =
+      cpu_accumulated_time_stat.StdDev() * total_iters;
+  stddev_data->bytes_per_second = bytes_per_second_stat.StdDev();
+  stddev_data->items_per_second = items_per_second_stat.StdDev();
+  stddev_data->max_heapbytes_used = max_heapbytes_used_stat.StdDev();
+}
+
+std::string ConsoleReporter::PrintMemoryUsage(double bytes) {
+  if (!get_memory_usage || bytes < 0.0)
+    return "";
+
+  std::stringstream ss;
+  ss << " " << HumanReadableNumber(bytes) << "B peak-mem";
+  return ss.str();
+}
+
+bool ConsoleReporter::ReportContext(const BenchmarkContextData& context) {
+  name_field_width_ = context.name_field_width;
+
+  std::cout << "Benchmarking on " << context.num_cpus << " X "
+            << context.mhz_per_cpu << " MHz CPU"
+            << ((context.num_cpus > 1) ? "s" : "") << "\n";
+
+  int remainder_ms;
+  char time_buf[32];
+  std::cout << walltime::Print(walltime::Now(), "%Y/%m/%d-%H:%M:%S",
+                               true,  // use local timezone
+                               time_buf, &remainder_ms) << "\n";
+
+  // Show details of CPU model, caches, TLBs etc.
+//  if (!context.cpu_info.empty())
+//    std::cout << "CPU: " << context.cpu_info.c_str();
+
+  if (context.cpu_scaling_enabled) {
+    std::cerr << "CPU scaling is enabled: Benchmark timings may be noisy.\n";
+  }
+
+  int output_width = fprintf(stdout, "%s%-*s %10s %10s %10s\n",
+                             Prefix(), name_field_width_, "Benchmark",
+                             "Time(ns)", "CPU(ns)", "Iterations");
+  std::cout << std::string(output_width - 1, '-').c_str() << "\n";
+
+  return true;
+}
+
+void ConsoleReporter::ReportRuns(const std::vector<BenchmarkRunData>& reports) {
+  for (std::vector<BenchmarkRunData>::const_iterator it = reports.begin();
+       it != reports.end(); ++it) {
+    CHECK_EQ(reports[0].benchmark_name, it->benchmark_name);
+    PrintRunData(*it);
+  }
+
+  // We don't report aggregated data if there was a single run.
+  if (reports.size() < 2)
+    return;
+
+  BenchmarkRunData mean_data;
+  BenchmarkRunData stddev_data;
+  internal::ComputeStats(reports, &mean_data, &stddev_data);
+
+  // Output using PrintRunData.
+  PrintRunData(mean_data);
+  PrintRunData(stddev_data);
+  fprintf(stdout, "\n");
+}
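+
+// With --benchmark_repetitions=N (N > 1), the loop above prints each run and
+// then two synthetic rows, "<name>_mean" and "<name>_stddev", computed by
+// internal::ComputeStats.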
+
+void ConsoleReporter::PrintRunData(const BenchmarkRunData& result) {
+  // Format bytes per second
+  std::string rate;
+  if (result.bytes_per_second > 0) {
+    std::stringstream ss;
+    ss << " " << HumanReadableNumber(result.bytes_per_second) << "B/s";
+    rate = ss.str();
+  }
+
+  // Format items per second
+  std::string items;
+  if (result.items_per_second > 0) {
+    std::stringstream ss;
+    ss << " " << HumanReadableNumber(result.items_per_second) << " items/s";
+    items = ss.str();
+  }
+
+  ColorPrintf(COLOR_DEFAULT, "%s", Prefix());
+  ColorPrintf(COLOR_GREEN, "%-*s ",
+              name_field_width_, result.benchmark_name.c_str());
+  ColorPrintf(COLOR_YELLOW, "%10.0f %10.0f ",
+              (result.real_accumulated_time * 1e9) /
+                  (static_cast<double>(result.iterations)),
+              (result.cpu_accumulated_time * 1e9) /
+                  (static_cast<double>(result.iterations)));
+  ColorPrintf(COLOR_CYAN, "%10lld", result.iterations);
+  ColorPrintf(COLOR_DEFAULT, "%*s %s %s%s\n", 16, rate.c_str(), items.c_str(),
+              result.report_label.c_str(),
+              PrintMemoryUsage(result.max_heapbytes_used).c_str());
+}
+
+void MemoryUsage() {
+  //if (benchmark_mc) {
+  //  benchmark_mc->Reset();
+  //} else {
+  get_memory_usage = true;
+  //}
+}
+
+void UseRealTime() {
+  use_real_time = true;
+}
+
+void PrintUsageAndExit() {
+  fprintf(stdout, "benchmark [--benchmark_filter=<regex>]\n"
+                  "          [--benchmark_min_iters=<min_iters>]\n"
+                  "          [--benchmark_max_iters=<max_iters>]\n"
+                  "          [--benchmark_min_time=<min_time>]\n"
+//                  "          [--benchmark_memory_usage]\n"
+                  "          [--benchmark_repetitions=<num_repetitions>]\n"
+                  "          [--color_print={true|false}]\n"
+                  "          [--v=<verbosity>]\n");
+  exit(0);
+}
+
+void ParseCommandLineFlags(int* argc, const char** argv) {
+  for (int i = 1; i < *argc; ++i) {
+    if (ParseStringFlag(argv[i], "benchmark_filter",
+                        &FLAGS_benchmark_filter) ||
+        ParseInt32Flag(argv[i], "benchmark_min_iters",
+                       &FLAGS_benchmark_min_iters) ||
+        ParseInt32Flag(argv[i], "benchmark_max_iters",
+                       &FLAGS_benchmark_max_iters) ||
+        ParseDoubleFlag(argv[i], "benchmark_min_time",
+                        &FLAGS_benchmark_min_time) ||
+        // TODO(dominic)
+//        ParseBoolFlag(argv[i], "gbenchmark_memory_usage",
+//                      &FLAGS_gbenchmark_memory_usage) ||
+        ParseInt32Flag(argv[i], "benchmark_repetitions",
+                       &FLAGS_benchmark_repetitions) ||
+        ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) ||
+        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
+      // Recognized flag: shift the remaining arguments left by one.
+      for (int j = i; j != *argc; ++j)
+        argv[j] = argv[j + 1];
+
+      --(*argc);
+      --i;
+    } else if (IsFlag(argv[i], "help"))
+      PrintUsageAndExit();
+  }
+}
+
+}  // end namespace internal
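+
+// For example, the test binary built by CMakeLists.txt can then be invoked
+// as (a sketch):
+//   ./bin/benchmark_test --benchmark_filter=BM_memcpy --benchmark_repetitions=3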
+
+// A clock that provides a fast mechanism to check if we're nearly done.
+class State::FastClock {
+ public:
+  enum Type { REAL_TIME, CPU_TIME };
+  explicit FastClock(Type type)
+      : type_(type), approx_time_(NowMicros()) {
+    sem_init(&bg_done_, 0, 0);
+    pthread_create(&bg_, NULL, &BGThreadWrapper, this);
+  }
+
+  ~FastClock() {
+    sem_post(&bg_done_);
+    pthread_join(bg_, NULL);
+    sem_destroy(&bg_done_);
+  }
+
+  // Returns true if the current time is guaranteed to be past "when_micros".
+  // This method is very fast.
+  inline bool HasReached(int64_t when_micros) {
+    return approx_time_ >= when_micros;
+    // NOTE: this is the same as we're dealing with an int64_t
+    //return (base::subtle::NoBarrier_Load(&approx_time_) >= when_micros);
+  }
+
+  // Returns the current time in microseconds past the epoch.
+  int64_t NowMicros() const {
+    double t = 0;
+    switch (type_) {
+      case REAL_TIME:
+        t = walltime::Now();
+        break;
+      case CPU_TIME:
+        t = MyCPUUsage() + ChildrenCPUUsage();
+        break;
+    }
+    return static_cast<int64_t>(t * 1e6);
+  }
+
+  // Reinitialize if necessary (since the clock type may be changed once the
+  // benchmark function starts running - see UseRealTime).
+  void InitType(Type type) {
+    type_ = type;
+    approx_time_ = NowMicros();
+    // NOTE: This is the same barring a memory barrier
+    // base::subtle::Release_Store(&approx_time_, NowMicros());
+  }
+
+ private:
+  Type type_;
+  int64_t approx_time_;  // Last time measurement taken by bg_
+  pthread_t bg_;  // Background thread that updates last_time_ once every ms
+
+  sem_t bg_done_;
+
+  static void* BGThreadWrapper(void* that) {
+    ((FastClock*)that)->BGThread();
+    return NULL;
+  }
+
+  void BGThread() {
+    int done = 0;
+    do {
+      SleepForMicroseconds(1000);
+      approx_time_ = NowMicros();
+      // NOTE: same code but no memory barrier. think on it.
+      //base::subtle::Release_Store(&approx_time_, NowMicros());
+      sem_getvalue(&bg_done_, &done);
+    } while (done == 0);
+  }
+
+  DISALLOW_COPY_AND_ASSIGN(FastClock);
+};
+
+namespace internal {
+
+const int Benchmark::kNumCpuMarker;
+
+// Information kept per benchmark we may want to run
+struct Benchmark::Instance {
+  Instance()
+      : rangeXset(false), rangeX(kNoRange),
+        rangeYset(false), rangeY(kNoRange) {}
+
+  std::string name;
+  Benchmark* bm;
+  bool rangeXset;
+  int rangeX;
+  bool rangeYset;
+  int rangeY;
+  int threads;  // Number of concurrent threads to use
+
+  bool multithreaded() const { return !bm->thread_counts_.empty(); }
+};
+
+}  // end namespace internal
+
+struct State::SharedState {
+  const internal::Benchmark::Instance* instance;
+  pthread_mutex_t mu;
+  int starting;  // Number of threads that have entered STARTING state
+  int stopping;  // Number of threads that have entered STOPPING state
+  int threads;   // Number of total threads that are running concurrently
+  internal::Benchmark::ThreadStats stats;
+  std::vector<internal::BenchmarkRunData> runs;  // accumulated runs
+  std::string label;
+
+  SharedState(const internal::Benchmark::Instance* b, int t)
+      : instance(b), starting(0), stopping(0), threads(t) {
+  }
+  DISALLOW_COPY_AND_ASSIGN(SharedState);
+};
+
+namespace internal {
+
+Benchmark::Benchmark(const char* name, BenchmarkFunction f)
+    : name_(name), function_(f) {
+  mutex_lock l(&benchmark_mutex);
+  if (families == nullptr)
+    families = new std::vector<Benchmark*>;
+  registration_index_ = families->size();
+  families->push_back(this);
+}
+
+Benchmark::~Benchmark() {
+  mutex_lock l(&benchmark_mutex);
+  CHECK((*families)[registration_index_] == this);
+  (*families)[registration_index_] = NULL;
+  // Shrink the vector if convenient.
+  while (!families->empty() && families->back() == NULL)
+    families->pop_back();
+}
+
+Benchmark* Benchmark::Arg(int x) {
+  mutex_lock l(&benchmark_mutex);
+  rangeX_.push_back(x);
+  return this;
+}
+
+Benchmark* Benchmark::Range(int start, int limit) {
+  std::vector<int> arglist;
+  AddRange(&arglist, start, limit, kRangeMultiplier);
+
+  mutex_lock l(&benchmark_mutex);
+  for (size_t i = 0; i < arglist.size(); ++i)
+    rangeX_.push_back(arglist[i]);
+  return this;
+}
+
+Benchmark* Benchmark::DenseRange(int start, int limit) {
+  CHECK_GE(start, 0);
+  CHECK_LE(start, limit);
+  mutex_lock l(&benchmark_mutex);
+  for (int arg = start; arg <= limit; ++arg)
+    rangeX_.push_back(arg);
+  return this;
+}
+
+Benchmark* Benchmark::ArgPair(int x, int y) {
+  mutex_lock l(&benchmark_mutex);
+  rangeX_.push_back(x);
+  rangeY_.push_back(y);
+  return this;
+}
+
+Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
+  std::vector<int> arglist1, arglist2;
+  AddRange(&arglist1, lo1, hi1, kRangeMultiplier);
+  AddRange(&arglist2, lo2, hi2, kRangeMultiplier);
+
+  mutex_lock l(&benchmark_mutex);
+  rangeX_.resize(arglist1.size());
+  std::copy(arglist1.begin(), arglist1.end(), rangeX_.begin());
+  rangeY_.resize(arglist2.size());
+  std::copy(arglist2.begin(), arglist2.end(), rangeY_.begin());
+  return this;
+}
+
+Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
+  custom_arguments(this);
+  return this;
+}
+
+Benchmark* Benchmark::Threads(int t) {
+  CHECK_GT(t, 0);
+  mutex_lock l(&benchmark_mutex);
+  thread_counts_.push_back(t);
+  return this;
+}
+
+Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
+  CHECK_GT(min_threads, 0);
+  CHECK_GE(max_threads, min_threads);
+
+  mutex_lock l(&benchmark_mutex);
+  AddRange(&thread_counts_, min_threads, max_threads, 2);
+  return this;
+}
+
+Benchmark* Benchmark::ThreadPerCpu() {
+  mutex_lock l(&benchmark_mutex);
+  thread_counts_.push_back(kNumCpuMarker);
+  return this;
+}
+
+void Benchmark::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
+  CHECK_GE(lo, 0);
+  CHECK_GE(hi, lo);
+
+  // Add "lo"
+  dst->push_back(lo);
+
+  // Now space out the benchmarks in multiples of "mult"
+  for (int32_t i = 1; i < std::numeric_limits<int32_t>::max()/mult; i *= mult) {
+    if (i >= hi) break;
+    if (i > lo)
+      dst->push_back(i);
+  }
+  // Add "hi" (if different from "lo")
+  if (hi != lo)
+    dst->push_back(hi);
+}
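+
+// For example, AddRange(&dst, 8, 8<<10, 8) fills dst with
+// {8, 64, 512, 4096, 8192}: both endpoints plus the powers of "mult"
+// strictly between them. This is what BENCHMARK(BM_memcpy)->Range(8, 8<<10)
+// ends up running.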
+
+std::vector<Benchmark::Instance> Benchmark::CreateBenchmarkInstances(
+    int rangeXindex, int rangeYindex) {
+  // Special list of thread counts to use when none are specified
+  std::vector<int> one_thread;
+  one_thread.push_back(1);
+
+  std::vector<Instance> instances;
+
+  const bool is_multithreaded = (!thread_counts_.empty());
+  const std::vector<int>* thread_counts =
+      (is_multithreaded ? &thread_counts_ : &one_thread);
+  for (size_t t = 0; t < thread_counts->size(); ++t) {
+    int num_threads = (*thread_counts)[t];
+    if (num_threads == kNumCpuMarker)
+      num_threads = NumCPUs();
+
+    Instance instance;
+    instance.name = name_;
+    instance.bm = this;
+    instance.threads = num_threads;
+
+    if (rangeXindex != kNoRange) {
+      instance.rangeX = rangeX_[rangeXindex];
+      instance.rangeXset = true;
+      AppendHumanReadable(instance.rangeX, &instance.name);
+    }
+    if (rangeYindex != kNoRange) {
+      instance.rangeY = rangeY_[rangeYindex];
+      instance.rangeYset = true;
+      AppendHumanReadable(instance.rangeY, &instance.name);
+    }
+
+    // Add the number of threads used to the name
+    if (is_multithreaded) {
+      std::stringstream ss;
+      ss << "/threads:" << instance.threads;
+      instance.name += ss.str();
+    }
+
+    instances.push_back(instance);
+  }
+
+  return instances;
+}
+
+// Extract the list of benchmark instances that match the specified
+// regular expression.
+void Benchmark::FindBenchmarks(const std::string& spec,
+                               std::vector<Instance>* benchmarks) {
+  // Make regular expression out of command-line flag
+  regex_t re;
+  int ec = regcomp(&re, spec.c_str(), REG_EXTENDED | REG_NOSUB);
+  if (ec != 0) {
+    size_t needed = regerror(ec, &re, NULL, 0);
+    char* errbuf = new char[needed];
+    regerror(ec, &re, errbuf, needed);
+    std::cerr << "Could not compile benchmark re: " << errbuf << "\n";
+    delete[] errbuf;
+    return;
+  }
+
+  mutex_lock l(&benchmark_mutex);
+  for (Benchmark* family : *families) {
+    if (family == nullptr) continue;  // Family was deleted
+
+    // Match against filter.
+    if (regexec(&re, family->name_.c_str(), 0, NULL, 0) != 0) {
+#ifdef DEBUG
+      std::cout << "Skipping " << family->name_ << "\n";
+#endif
+      continue;
+    }
+
+    std::vector<Instance> instances;
+    if (family->rangeX_.empty() && family->rangeY_.empty()) {
+      instances = family->CreateBenchmarkInstances(kNoRange, kNoRange);
+      benchmarks->insert(benchmarks->end(), instances.begin(), instances.end());
+    } else if (family->rangeY_.empty()) {
+      for (size_t x = 0; x < family->rangeX_.size(); ++x) {
+        instances = family->CreateBenchmarkInstances(x, kNoRange);
+        benchmarks->insert(benchmarks->end(),
+                           instances.begin(), instances.end());
+      }
+    } else {
+      for (size_t x = 0; x < family->rangeX_.size(); ++x) {
+        for (size_t y = 0; y < family->rangeY_.size(); ++y) {
+          instances = family->CreateBenchmarkInstances(x, y);
+          benchmarks->insert(benchmarks->end(),
+                             instances.begin(), instances.end());
+        }
+      }
+    }
+  }
+}
+
+void Benchmark::MeasureOverhead() {
+  State::FastClock clock(State::FastClock::CPU_TIME);
+  State::SharedState state(NULL, 1);
+  State runner(&clock, &state, 0);
+  while (runner.KeepRunning()) {}
+  overhead = state.runs[0].real_accumulated_time /
+             static_cast<double>(state.runs[0].iterations);
+#ifdef DEBUG
+  std::cout << "Per-iteration overhead for doing nothing: " << overhead << "\n";
+#endif
+}
+
+void Benchmark::RunInstance(const Instance& b, BenchmarkReporter* br) {
+  use_real_time = false;
+  running_benchmark = true;
+  // get_memory_usage = FLAGS_gbenchmark_memory_usage;
+  State::FastClock clock(State::FastClock::CPU_TIME);
+
+  // Initialize the test runners.
+  State::SharedState state(&b, b.threads);
+  {
+    std::unique_ptr<State> runners[b.threads];
+    // TODO: create thread objects
+    for (int i = 0; i < b.threads; ++i)
+      runners[i].reset(new State(&clock, &state, i));
+
+    // Run them all.
+    for (int i = 0; i < b.threads; ++i) {
+      State* r = runners[i].release();
+      if (b.multithreaded()) {
+        // TODO: start pthreads (member of state?)
and set up thread local + // pointers to stats + //pool->Add(base::NewCallback(r, &State::Run)); + } else { + pthread_setspecific(thread_stats_key, thread_stats); + r->Run(); + } + } + if (b.multithreaded()) { + // TODO: join all the threads + //pool->JoinAll(); + } + } +/* + double mem_usage = 0; + if (get_memory_usage) { + // Measure memory usage + Notification mem_done; + BenchmarkRun mem_run; + BenchmarkRun::SharedState mem_shared(&b, 1); + mem_run.Init(&clock, &mem_shared, 0); + { + testing::MallocCounter mc(testing::MallocCounter::THIS_THREAD_ONLY); + benchmark_mc = &mc; + mem_run.Run(&mem_done); + mem_done.WaitForNotification(); + benchmark_mc = NULL; + mem_usage = mc.PeakHeapGrowth(); + } + } +*/ + running_benchmark = false; + + for (internal::BenchmarkRunData& report : state.runs) { + double seconds = (use_real_time ? report.real_accumulated_time : + report.cpu_accumulated_time); + // TODO: add the thread index here? + report.benchmark_name = b.name; + report.report_label = state.label; + report.bytes_per_second = state.stats.bytes_processed / seconds; + report.items_per_second = state.stats.items_processed / seconds; + report.max_heapbytes_used = MeasurePeakHeapMemory(b); + } + + br->ReportRuns(state.runs); +} + +// Run the specified benchmark, measure its peak memory usage, and +// return the peak memory usage. +double Benchmark::MeasurePeakHeapMemory(const Instance& b) { + if (!get_memory_usage) + return 0.0; + double bytes = 0.0; + /* TODO(dominich) + // Should we do multi-threaded runs? + const int num_threads = 1; + const int num_iters = 1; + { +// internal::MallocCounter mc(internal::MallocCounter::THIS_THREAD_ONLY); + running_benchmark = true; + timer_manager = new TimerManager(1, NULL); +// benchmark_mc = &mc; + timer_manager->StartTimer(); + + b.Run(num_iters); + + running_benchmark = false; + delete timer_manager; + timer_manager = NULL; +// benchmark_mc = NULL; +// bytes = mc.PeakHeapGrowth(); + } + */ + return bytes; +} + +} // end namespace internal + +State::State(FastClock* clock, SharedState* s, int t) + : thread_index(t), + state_(STATE_INITIAL), + clock_(clock), + shared_(s), + iterations_(0), + start_cpu_(0.0), + start_time_(0.0), + stop_time_micros_(0.0), + start_pause_(0.0), + pause_time_(0.0), + total_iterations_(0), + interval_micros_( + static_cast(1e6 * FLAGS_benchmark_min_time / + FLAGS_benchmark_repetitions)) { +} + +bool State::KeepRunning() { + // Fast path + if (!clock_->HasReached(stop_time_micros_ + pause_time_)) { + ++iterations_; + return true; + } + + switch(state_) { + case STATE_INITIAL: return StartRunning(); + case STATE_STARTING: CHECK(false); return true; + case STATE_RUNNING: return FinishInterval(); + case STATE_STOPPING: return MaybeStop(); + case STATE_STOPPED: CHECK(false); return true; + } + CHECK(false); + return false; +} + +void State::PauseTiming() { + start_pause_ = walltime::Now(); +} + +void State::ResumeTiming() { + pause_time_ += walltime::Now() - start_pause_; +} + +void State::SetBytesProcessed(int64_t bytes) { + CHECK_EQ(STATE_STOPPED, state_); + mutex_lock l(&shared_->mu); + internal::Benchmark::ThreadStats* thread_stats = + (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key); + thread_stats->bytes_processed = bytes; +} + +void State::SetItemsProcessed(int64_t items) { + CHECK_EQ(STATE_STOPPED, state_); + mutex_lock l(&shared_->mu); + internal::Benchmark::ThreadStats* thread_stats = + (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key); + thread_stats->items_processed = items; 
+}
+
+void State::SetLabel(const std::string& label) {
+  CHECK_EQ(STATE_STOPPED, state_);
+  mutex_lock l(&shared_->mu);
+  shared_->label = label;
+}
+
+int State::range_x() const {
+  CHECK(shared_->instance->rangeXset);
+  // TODO: report the message once CHECK supports streaming:
+  //   "Failed to get range_x as it was not set. Did you register your "
+  //   "benchmark with a range parameter?"
+  return shared_->instance->rangeX;
+}
+
+int State::range_y() const {
+  CHECK(shared_->instance->rangeYset);
+  // TODO: report the message once CHECK supports streaming:
+  //   "Failed to get range_y as it was not set. Did you register your "
+  //   "benchmark with a range parameter?"
+  return shared_->instance->rangeY;
+}
+
+bool State::StartRunning() {
+  {
+    mutex_lock l(&shared_->mu);
+    CHECK_EQ(state_, STATE_INITIAL);
+    state_ = STATE_STARTING;
+    is_continuation_ = false;
+    CHECK_LT(shared_->starting, shared_->threads);
+    ++shared_->starting;
+    if (shared_->starting == shared_->threads) {
+      // Last thread to start.
+      clock_->InitType(
+          use_real_time ? FastClock::REAL_TIME : FastClock::CPU_TIME);
+    } else {
+      // Wait for others.
+      // TODO(dominic): semaphore!
+      // while (pthread_getsemaphore(shared_->starting_sem_) !=
+      //        shared_->threads) { }
+      //shared_->mu.Await(base::Condition(this, &State::AllStarting));
+    }
+    CHECK_EQ(state_, STATE_STARTING);
+    state_ = STATE_RUNNING;
+  }
+  NewInterval();
+  return true;
+}
+
+bool State::AllStarting() {
+  CHECK_LE(shared_->starting, shared_->threads);
+  return shared_->starting == shared_->threads;
+}
+
+void State::NewInterval() {
+  stop_time_micros_ = clock_->NowMicros() + interval_micros_;
+  if (!is_continuation_) {
+#ifdef DEBUG
+    std::cout << "Starting new interval; stopping in " << interval_micros_
+              << "\n";
+#endif
+    iterations_ = 0;
+    pause_time_ = 0;
+    start_cpu_ = MyCPUUsage() + ChildrenCPUUsage();
+    start_time_ = walltime::Now();
+  } else {
+#ifdef DEBUG
+    std::cout << "Continuing interval; stopping in " << interval_micros_
+              << "\n";
+#endif
+  }
+}
+
+bool State::FinishInterval() {
+  if (iterations_ < FLAGS_benchmark_min_iters / FLAGS_benchmark_repetitions &&
+      interval_micros_ < 5000000) {
+    interval_micros_ *= 2;
+#ifdef DEBUG
+    std::cout << "Interval was too short; trying again for "
+              << interval_micros_ << " useconds.\n";
+#endif
+    is_continuation_ = false;
+    NewInterval();
+    return true;
+  }
+
+  internal::BenchmarkRunData data;
+  data.thread_index = thread_index;
+  data.iterations = iterations_;
+
+  const double accumulated_time = walltime::Now() - start_time_;
+  const double total_overhead = 0.0;  // TODO: overhead * iterations_;
+  CHECK_LT(pause_time_, accumulated_time);
+  CHECK_LT(pause_time_ + total_overhead, accumulated_time);
+  data.real_accumulated_time =
+      accumulated_time - (pause_time_ + total_overhead);
+  data.cpu_accumulated_time = (MyCPUUsage() + ChildrenCPUUsage()) - start_cpu_;
+  total_iterations_ += iterations_;
+
+  bool keep_going = false;
+  {
+    mutex_lock l(&shared_->mu);
+    if (is_continuation_)
+      shared_->runs.back() = data;
+    else
+      shared_->runs.push_back(data);
+    keep_going = RunAnotherInterval();
+    if (!keep_going) {
+      ++shared_->stopping;
+      if (shared_->stopping < shared_->threads) {
+        // Other threads are still running, so continue running but without
+        // timing to present an expected background load to the other threads.
+        state_ = STATE_STOPPING;
+        keep_going = true;
+      } else {
+        state_ = STATE_STOPPED;
+      }
+    }
+  }
+
+  if (state_ == STATE_RUNNING) {
+    is_continuation_ = true;
+    NewInterval();
+  }
+  return keep_going;
+}
+
+bool State::RunAnotherInterval() const {
+  if (total_iterations_ < FLAGS_benchmark_min_iters)
+    return true;
+  if (total_iterations_ > FLAGS_benchmark_max_iters)
+    return false;
+  if (static_cast<int32_t>(shared_->runs.size()) >= FLAGS_benchmark_repetitions)
+    return false;
+  return true;
+}
+
+bool State::MaybeStop() {
+  mutex_lock l(&shared_->mu);
+  if (shared_->stopping < shared_->threads) {
+    CHECK_EQ(state_, STATE_STOPPING);
+    return true;
+  }
+  state_ = STATE_STOPPED;
+  return false;
+}
+
+void State::Run() {
+  internal::Benchmark::ThreadStats* thread_stats =
+      (internal::Benchmark::ThreadStats*) pthread_getspecific(thread_stats_key);
+  thread_stats->Reset();
+  shared_->instance->bm->function_(*this);
+  {
+    mutex_lock l(&shared_->mu);
+    shared_->stats.Add(*thread_stats);
+  }
+}
+
+namespace internal {
+
+void RunMatchingBenchmarks(const std::string& spec,
+                           BenchmarkReporter* reporter) {
+  CHECK(reporter != NULL);
+  if (spec.empty()) return;
+
+  std::vector<internal::Benchmark::Instance> benchmarks;
+  internal::Benchmark::FindBenchmarks(spec, &benchmarks);
+
+  // Determine the width of the name field using a minimum width of 10.
+  // Also determine max number of threads needed.
+  int name_field_width = 10;
+  for (const internal::Benchmark::Instance& benchmark : benchmarks) {
+    // Add width for _stddev and threads:XX
+    if (benchmark.threads > 1 && FLAGS_benchmark_repetitions > 1) {
+      name_field_width = std::max(name_field_width,
+                                  static_cast<int>(benchmark.name.size()) + 17);
+    } else if (benchmark.threads > 1) {
+      name_field_width = std::max(name_field_width,
+                                  static_cast<int>(benchmark.name.size()) + 10);
+    } else if (FLAGS_benchmark_repetitions > 1) {
+      name_field_width = std::max(name_field_width,
+                                  static_cast<int>(benchmark.name.size()) + 7);
+    } else {
+      name_field_width = std::max(name_field_width,
+                                  static_cast<int>(benchmark.name.size()));
+    }
+  }
+
+  // Print header here
+  BenchmarkContextData context;
+  context.num_cpus = NumCPUs();
+  context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;
+//  context.cpu_info = base::CompactCPUIDInfoString();
+  context.cpu_scaling_enabled = CpuScalingEnabled();
+  context.name_field_width = name_field_width;
+
+  if (reporter->ReportContext(context)) {
+    for (internal::Benchmark::Instance& benchmark : benchmarks) {
+      //std::unique_ptr<thread::ThreadPool> pool;
+      //if (benchmark.threads > 0) {
+      //  pool = new thread::ThreadPool(benchmark.threads);
+      //  pool->StartWorkers();
+      //}
+      Benchmark::RunInstance(/*pool, */benchmark, reporter);
+    }
+  }
+}
+
+void FindMatchingBenchmarkNames(const std::string& spec,
+                                std::vector<std::string>* benchmark_names) {
+  if (spec.empty()) return;
+
+  std::vector<internal::Benchmark::Instance> benchmarks;
+  internal::Benchmark::FindBenchmarks(spec, &benchmarks);
+  // Append through a back inserter; writing through benchmark_names->begin()
+  // assumes capacity the vector does not have.
+  std::transform(benchmarks.begin(), benchmarks.end(),
+                 std::back_inserter(*benchmark_names),
+                 [] (const internal::Benchmark::Instance& b) { return b.name; });
+}
+
+} // end namespace internal
+
+void RunSpecifiedBenchmarks() {
+  std::string spec = FLAGS_benchmark_filter;
+  if (spec.empty() || spec == "all")
+    spec = ".";  // Regexp that matches all benchmarks
+  internal::ConsoleReporter default_reporter;
+  internal::RunMatchingBenchmarks(spec, &default_reporter);
+}
+
+void Initialize(int* argc, const char** argv) {
+  //AtomicOps_Internalx86CPUFeaturesInit();
+  pthread_mutex_init(&benchmark_mutex, nullptr);
+  pthread_key_create(&thread_stats_key, DeleteThreadStats);
+  thread_stats = new internal::Benchmark::ThreadStats();
+
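+  // Establish the walltime base and measure the timing overhead before any
+  // benchmark runs.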
walltime::Initialize(); + internal::Benchmark::MeasureOverhead(); + internal::ParseCommandLineFlags(argc, argv); +} + + +} // end namespace benchmark diff --git a/src/colorprint.cc b/src/colorprint.cc new file mode 100644 index 00000000..42d69cb6 --- /dev/null +++ b/src/colorprint.cc @@ -0,0 +1,82 @@ +#include "colorprint.h" + +#include + +#include "commandlineflags.h" + +DECLARE_bool(color_print); + +namespace { +#ifdef OS_WINDOWS +typedef WORD PlatformColorCode; +#else +typedef const char* PlatformColorCode; +#endif + +PlatformColorCode GetPlatformColorCode(LogColor color) { +#ifdef OS_WINDOWS + switch (color) { + case COLOR_RED: return FOREGROUND_RED; + case COLOR_GREEN: return FOREGROUND_GREEN; + case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; + case COLOR_BLUE: return FOREGROUND_BLUE; + case COLOR_MAGENTA: return FOREGROUND_BLUE | FOREGROUND_RED; + case COLOR_CYAN: return FOREGROUND_BLUE | FOREGROUND_GREEN; + case COLOR_WHITE: // fall through to default + default: return 0; + } +#else + switch (color) { + case COLOR_RED: return "1"; + case COLOR_GREEN: return "2"; + case COLOR_YELLOW: return "3"; + case COLOR_BLUE: return "4"; + case COLOR_MAGENTA: return "5"; + case COLOR_CYAN: return "6"; + case COLOR_WHITE: return "7"; + default: return NULL; + }; +#endif +} +} // end namespace + +void ColorPrintf(LogColor color, const char* fmt, ...) { + va_list args; + va_start(args, fmt); + + if (!FLAGS_color_print) { + vprintf(fmt, args); + va_end(args); + return; + } + +#ifdef OS_WINDOWS + const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); + + // Gets the current text color. + CONSOLE_SCREEN_BUFFER_INFO buffer_info; + GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); + const WORD old_color_attrs = buffer_info.wAttributes; + + // We need to flush the stream buffers into the console before each + // SetConsoleTextAttribute call lest it affect the text that is already + // printed but has not yet reached the console. + fflush(stdout); + SetConsoleTextAttribute(stdout_handle, + GetPlatformColorCode(color) | FOREGROUND_INTENSITY); + vprintf(fmt, args); + + fflush(stdout); + // Restores the text color. + SetConsoleTextAttribute(stdout_handle, old_color_attrs); +#else + const char* color_code = GetPlatformColorCode(color); + if (color_code) + fprintf(stdout, "\033[0;3%sm", color_code); + vprintf(fmt, args); + printf("\033[m"); // Resets the terminal to default. +#endif + va_end(args); +} + + diff --git a/src/colorprint.h b/src/colorprint.h new file mode 100644 index 00000000..5789c2e7 --- /dev/null +++ b/src/colorprint.h @@ -0,0 +1,17 @@ +#ifndef BENCHMARK_COLORPRINT_H_ +#define BENCHMARK_COLORPRINT_H_ + +enum LogColor { + COLOR_DEFAULT, + COLOR_RED, + COLOR_GREEN, + COLOR_YELLOW, + COLOR_BLUE, + COLOR_MAGENTA, + COLOR_CYAN, + COLOR_WHITE +}; + +void ColorPrintf(LogColor color, const char* fmt, ...); + +#endif // BENCHMARK_COLORPRINT_H_ diff --git a/src/commandlineflags.cc b/src/commandlineflags.cc new file mode 100644 index 00000000..331b8ff5 --- /dev/null +++ b/src/commandlineflags.cc @@ -0,0 +1,213 @@ +#include "commandlineflags.h" + +#include + +#include +#include + +namespace benchmark { +// Parses 'str' for a 32-bit signed integer. If successful, writes +// the result to *value and returns true; otherwise leaves *value +// unchanged and returns false. +bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) { + // Parses the environment variable as a decimal integer. 
+ char* end = NULL; + const long long_value = strtol(str, &end, 10); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + std::cerr << src_text << " is expected to be a 32-bit integer, " + << "but actually has value \"" << str << "\".\n"; + return false; + } + + // Is the parsed value in the range of an Int32? + const int32_t result = static_cast(long_value); + if (long_value == std::numeric_limits::max() || + long_value == std::numeric_limits::min() || + // The parsed value overflows as a long. (strtol() returns + // LONG_MAX or LONG_MIN when the input overflows.) + result != long_value + // The parsed value overflows as an Int32. + ) { + std::cerr << src_text << " is expected to be a 32-bit integer, " + << "but actually has value \"" << str << "\", " + << "which overflows.\n"; + return false; + } + + *value = result; + return true; +} + +// Parses 'str' for a double. If successful, writes the result to *value and +// returns true; otherwise leaves *value unchanged and returns false. +bool ParseDouble(const std::string& src_text, const char* str, double* value) { + // Parses the environment variable as a decimal integer. + char* end = NULL; + const double double_value = strtod(str, &end); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + std::cerr << src_text << " is expected to be a double, " + << "but actually has value \"" << str << "\".\n"; + return false; + } + + *value = double_value; + return true; +} + + +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE + // We are on Windows CE, which has no environment variables. + return NULL; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != NULL && env[0] != '\0') ? env : NULL; +#else + return getenv(name); +#endif +} + +// Returns the name of the environment variable corresponding to the +// given flag. For example, FlagToEnvVar("foo") will return +// "BENCHMARK_FOO" in the open-source version. +static std::string FlagToEnvVar(const char* flag) { + const std::string flag_str(flag); + + std::string env_var; + for (size_t i = 0; i != flag_str.length(); ++i) + env_var += ::toupper(flag_str.c_str()[i]); + + return "BENCHMARK_" + env_var; +} + +// Reads and returns the Boolean environment variable corresponding to +// the given flag; if it's not set, returns default_value. +// +// The value is considered true iff it's not "0". +bool BoolFromEnv(const char* flag, bool default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = GetEnv(env_var.c_str()); + return string_value == NULL ? + default_value : strcmp(string_value, "0") != 0; +} + +// Reads and returns a 32-bit integer stored in the environment +// variable corresponding to the given flag; if it isn't set or +// doesn't represent a valid 32-bit integer, returns default_value. +int32_t Int32FromEnv(const char* flag, int32_t default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const string_value = GetEnv(env_var.c_str()); + if (string_value == NULL) { + // The environment variable is not set. 
+ return default_value; + } + + int32_t result = default_value; + if (!ParseInt32(std::string("Environment variable ") + env_var, + string_value, &result)) { + std::cout << "The default value " << default_value << " is used.\n"; + return default_value; + } + + return result; +} + +// Reads and returns the string environment variable corresponding to +// the given flag; if it's not set, returns default_value. +const char* StringFromEnv(const char* flag, const char* default_value) { + const std::string env_var = FlagToEnvVar(flag); + const char* const value = GetEnv(env_var.c_str()); + return value == NULL ? default_value : value; +} + +// Parses a string as a command line flag. The string should have +// the format "--flag=value". When def_optional is true, the "=value" +// part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +const char* ParseFlagValue(const char* str, + const char* flag, + bool def_optional) { + // str and flag must not be NULL. + if (str == NULL || flag == NULL) return NULL; + + // The flag must start with "--". + const std::string flag_str = std::string("--") + std::string(flag); + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; + + // Skips the flag name. + const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) + return flag_end; + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return NULL; + + // Returns the string after "=". + return flag_end + 1; +} + +bool ParseBoolFlag(const char* str, const char* flag, bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, true); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + return ParseInt32(std::string("The value of flag --") + flag, + value_str, value); +} + +bool ParseDoubleFlag(const char* str, const char* flag, double* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + return ParseDouble(std::string("The value of flag --") + flag, + value_str, value); +} + +bool ParseStringFlag(const char* str, const char* flag, std::string* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. 
+ if (value_str == NULL) return false; + + *value = value_str; + return true; +} + +bool IsFlag(const char* str, const char* flag) { + return (ParseFlagValue(str, flag, true) != NULL); +} +} // end namespace benchmark diff --git a/src/commandlineflags.h b/src/commandlineflags.h new file mode 100644 index 00000000..056d9fc0 --- /dev/null +++ b/src/commandlineflags.h @@ -0,0 +1,79 @@ +#ifndef BENCHMARK_COMMANDLINEFLAGS_H_ +#define BENCHMARK_COMMANDLINEFLAGS_H_ + +#include + +#include + +// Macro for referencing flags. +#define FLAG(name) FLAGS_##name + +// Macros for declaring flags. +#define DECLARE_bool(name) extern bool FLAG(name) +#define DECLARE_int32(name) extern int32_t FLAG(name) +#define DECLARE_int64(name) extern int64_t FLAG(name) +#define DECLARE_double(name) extern double FLAG(name) +#define DECLARE_string(name) extern std::string FLAG(name) + +// Macros for defining flags. +#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val) +#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val) +#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val) +#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val) +#define DEFINE_string(name, default_val, doc) \ + std::string FLAG(name) = (default_val) + +namespace benchmark { + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +bool ParseInt32(const std::string& src_text, const char* str, int32_t* value); + +// Parses a bool/Int32/string from the environment variable +// corresponding to the given Google Test flag. +bool BoolFromEnv(const char* flag, bool default_val); +int32_t Int32FromEnv(const char* flag, int32_t default_val); +double DoubleFromEnv(const char* flag, double default_val); +const char* StringFromEnv(const char* flag, const char* default_val); + +// Parses a string for a bool flag, in the form of either +// "--flag=value" or "--flag". +// +// In the former case, the value is taken as true as long as it does +// not start with '0', 'f', or 'F'. +// +// In the latter case, the value is taken as true. +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseBoolFlag(const char* str, const char* flag, bool* value); + +// Parses a string for an Int32 flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseInt32Flag(const char* str, const char* flag, int32_t* value); + +// Parses a string for a Double flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseDoubleFlag(const char* str, const char* flag, double* value); + +// Parses a string for a string flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseStringFlag(const char* str, const char* flag, std::string* value); + +// Returns true if the string matches the flag. 
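+//
+// For illustration, an argv-scanning loop built on ParseBoolFlag /
+// ParseInt32Flag / ParseStringFlag might look like the following sketch
+// (error handling omitted; FLAGS_benchmark_filter is one of the library's
+// string flags):
+//
+//   for (int i = 1; i < argc; ++i) {
+//     ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter);
+//   }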
+bool IsFlag(const char* str, const char* flag); + +} // end namespace gbenchmark + +#endif // BENCHMARK_COMMANDLINEFLAGS_H_ + diff --git a/src/cycleclock.h b/src/cycleclock.h new file mode 100644 index 00000000..d5ba314f --- /dev/null +++ b/src/cycleclock.h @@ -0,0 +1,129 @@ +// ---------------------------------------------------------------------- +// CycleClock +// A CycleClock tells you the current time in Cycles. The "time" +// is actually time since power-on. This is like time() but doesn't +// involve a system call and is much more precise. +// +// NOTE: Not all cpu/platform/kernel combinations guarantee that this +// clock increments at a constant rate or is synchronized across all logical +// cpus in a system. +// +// If you need the above guarantees, please consider using a different +// API. There are efforts to provide an interface which provides a millisecond +// granularity and implemented as a memory read. A memory read is generally +// cheaper than the CycleClock for many architectures. +// +// Also, in some out of order CPU implementations, the CycleClock is not +// serializing. So if you're trying to count at cycles granularity, your +// data might be inaccurate due to out of order instruction execution. +// ---------------------------------------------------------------------- + +#ifndef BENCHMARK_CYCLECLOCK_H_ +#define BENCHMARK_CYCLECLOCK_H_ + +#include + +#if defined(OS_MACOSX) +# include +#endif +// For MSVC, we want to use '_asm rdtsc' when possible (since it works +// with even ancient MSVC compilers), and when not possible the +// __rdtsc intrinsic, declared in . Unfortunately, in some +// environments, and have conflicting +// declarations of some other intrinsics, breaking compilation. +// Therefore, we simply declare __rdtsc ourselves. See also +// http://connect.microsoft.com/VisualStudio/feedback/details/262047 +#if defined(COMPILER_MSVC) && !defined(_M_IX86) +extern "C" uint64_t __rdtsc(); +#pragma intrinsic(__rdtsc) +#endif +#include + +// NOTE: only i386 and x86_64 have been well tested. +// PPC, sparc, alpha, and ia64 are based on +// http://peter.kuscsik.com/wordpress/?p=14 +// with modifications by m3b. See also +// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h +struct CycleClock { + // This should return the number of cycles since power-on. Thread-safe. + static inline int64_t Now() { +#if defined(OS_MACOSX) + // this goes at the top because we need ALL Macs, regardless of + // architecture, to return the number of "mach time units" that + // have passed since startup. See sysinfo.cc where + // InitializeSystemInfo() sets the supposed cpu clock frequency of + // macs to the number of mach time units per second, not actual + // CPU clock frequency (which can change in the face of CPU + // frequency scaling). Also note that when the Mac sleeps, this + // counter pauses; it does not continue counting, nor does it + // reset to zero. + return mach_absolute_time(); +#elif defined(__i386__) + int64_t ret; + __asm__ volatile ("rdtsc" : "=A" (ret) ); + return ret; +#elif defined(__x86_64__) || defined(__amd64__) + uint64_t low, high; + __asm__ volatile ("rdtsc" : "=a" (low), "=d" (high)); + return (high << 32) | low; +#elif defined(__powerpc__) || defined(__ppc__) + // This returns a time-base, which is not always precisely a cycle-count. 
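+    // tbu is read both before and after tbl; if the two reads disagree, the
+    // low word wrapped in between and tbl is masked to zero below so the
+    // (tbu1, tbl) pair stays consistent.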
+ int64_t tbl, tbu0, tbu1; + asm("mftbu %0" : "=r" (tbu0)); + asm("mftb %0" : "=r" (tbl)); + asm("mftbu %0" : "=r" (tbu1)); + tbl &= -static_cast(tbu0 == tbu1); + // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage) + return (tbu1 << 32) | tbl; +#elif defined(__sparc__) + int64_t tick; + asm(".byte 0x83, 0x41, 0x00, 0x00"); + asm("mov %%g1, %0" : "=r" (tick)); + return tick; +#elif defined(__ia64__) + int64_t itc; + asm("mov %0 = ar.itc" : "=r" (itc)); + return itc; +#elif defined(COMPILER_MSVC) && defined(_M_IX86) + // Older MSVC compilers (like 7.x) don't seem to support the + // __rdtsc intrinsic properly, so I prefer to use _asm instead + // when I know it will work. Otherwise, I'll use __rdtsc and hope + // the code is being compiled with a non-ancient compiler. + _asm rdtsc +#elif defined(COMPILER_MSVC) + return __rdtsc(); +#elif defined(ARMV3) +#if defined(ARMV6) // V6 is the earliest arch that has a standard cyclecount + uint32_t pmccntr; + uint32_t pmuseren; + uint32_t pmcntenset; + // Read the user mode perf monitor counter access permissions. + asm("mrc p15, 0, %0, c9, c14, 0" : "=r" (pmuseren)); + if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. + asm("mrc p15, 0, %0, c9, c12, 1" : "=r" (pmcntenset)); + if (pmcntenset & 0x80000000ul) { // Is it counting? + asm("mrc p15, 0, %0, c9, c13, 0" : "=r" (pmccntr)); + // The counter is set up to count every 64th cycle + return static_cast(pmccntr) * 64; // Should optimize to << 6 + } + } +#endif + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(__mips__) + // mips apparently only allows rdtsc for superusers, so we fall + // back to gettimeofday. It's possible clock_gettime would be better. + struct timeval tv; + gettimeofday(&tv, NULL); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#else +// The soft failover to a generic implementation is automatic only for ARM. +// For other platforms the developer is expected to make an attempt to create +// a fast implementation and use generic version if nothing better is available. +#error You need to define CycleTimer for your OS and CPU +#endif + } +}; + +#endif // BENCHMARK_CYCLECLOCK_H_ diff --git a/src/macros.h b/src/macros.h new file mode 100644 index 00000000..b4703282 --- /dev/null +++ b/src/macros.h @@ -0,0 +1,110 @@ +#ifndef BENCHMARK_MACROS_H_ +#define BENCHMARK_MACROS_H_ + +#include + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&); + +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// +// One caveat is that, for C++03, arraysize() doesn't accept any array of +// an anonymous type or a type defined inside a function. In these rare +// cases, you have to use the unsafe ARRAYSIZE() macro below. This is +// due to a limitation in C++03's template system. The limitation has +// been removed in C++11. + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. 
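+//
+// Typical use of arraysize(), as a sketch (kSizes is hypothetical):
+//
+//   static const int kSizes[] = {8, 64, 512};
+//   for (size_t i = 0; i < arraysize(kSizes); ++i) { /* ... */ }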
+#ifndef COMPILER_MSVC
+template <typename T, size_t N>
+char (&ArraySizeHelper(const T (&array)[N]))[N];
+#endif
+
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
+
+// The STATIC_ASSERT macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+//   STATIC_ASSERT(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES,
+//                 content_type_names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+//   STATIC_ASSERT(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+template <bool>
+struct StaticAssert {
+};
+
+#define STATIC_ASSERT(expr, msg) \
+  typedef StaticAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
+
+// Implementation details of STATIC_ASSERT:
+//
+// - STATIC_ASSERT works by defining an array type that has -1
+//   elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+//     #define STATIC_ASSERT(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+//   does not work, as gcc supports variable-length arrays whose sizes
+//   are determined at run-time (this is gcc's extension and not part
+//   of the C++ standard). As a result, gcc fails to reject the
+//   following code with the simple definition:
+//
+//     int foo;
+//     STATIC_ASSERT(foo, msg); // not supposed to compile as foo is
+//                              // not a compile-time constant.
+//
+// - By using the type StaticAssert<(bool(expr))>, we ensure that
+//   expr is a compile-time constant. (Template arguments must be
+//   determined at compile-time.)
+//
+// - The outer parentheses in StaticAssert<(bool(expr))> are necessary
+//   to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
+//
+//     StaticAssert<bool(expr)>
+//
+//   instead, these compilers will refuse to compile
+//
+//     STATIC_ASSERT(5 > 0, some_message);
+//
+//   (They seem to think the ">" in "5 > 0" marks the end of the
+//   template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+//     ((expr) ? 1 : -1).
+//
+//   This is to avoid running into a bug in MS VC 7.1, which
+//   causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
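+
+// Minimal CHECK macros built on assert(): a failed condition aborts the
+// process in debug builds. Note that with -DNDEBUG the condition is still
+// evaluated, but a failure is silently ignored.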
+ +#define CHECK(b) do { if (!(b)) assert(false); } while(0) +#define CHECK_EQ(a, b) CHECK((a) == (b)) +#define CHECK_GE(a, b) CHECK((a) >= (b)) +#define CHECK_LE(a, b) CHECK((a) <= (b)) +#define CHECK_GT(a, b) CHECK((a) > (b)) +#define CHECK_LT(a, b) CHECK((a) < (b)) + + +#define ATTRIBUTE_UNUSED __attribute__ ((unused)) + +#endif // BENCHMARK_MACROS_H_ diff --git a/src/mutex_lock.h b/src/mutex_lock.h new file mode 100644 index 00000000..40f0fdee --- /dev/null +++ b/src/mutex_lock.h @@ -0,0 +1,20 @@ +#ifndef BENCHMARK_MUTEX_LOCK_H_ +#define BENCHMARK_MUTEX_LOCK_H_ + +#include + +class mutex_lock { + public: + explicit mutex_lock(pthread_mutex_t* mu) : mu_(mu) { + pthread_mutex_lock(mu_); + } + + ~mutex_lock() { + pthread_mutex_unlock(mu_); + } + + private: + pthread_mutex_t* mu_; +}; + +#endif // BENCHMARK_MUTEX_LOCK_H_ diff --git a/src/port.h b/src/port.h new file mode 100644 index 00000000..7d8fe1cc --- /dev/null +++ b/src/port.h @@ -0,0 +1,8 @@ +#ifndef BENCHMARK_PORT_H_ +#define BENCHMARK_PORT_H_ + +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&); \ + void operator=(const TypeName&); + +#endif // BENCHMARK_PORT_H_ diff --git a/src/sleep.cc b/src/sleep.cc new file mode 100644 index 00000000..82292919 --- /dev/null +++ b/src/sleep.cc @@ -0,0 +1,42 @@ +#include "sleep.h" + +#include +#include + +#ifdef OS_WINDOWS + +// Window's _sleep takes milliseconds argument. +void SleepForMilliseconds(int milliseconds) { + _sleep(milliseconds); +} +void SleepForSeconds(double seconds) { + SleepForMilliseconds(static_cast(seconds * 1000)); +} + +#else // OS_WINDOWS + +static const int64_t kNumMillisPerSecond = 1000LL; +static const int64_t kNumMicrosPerMilli = 1000LL; +static const int64_t kNumMicrosPerSecond = kNumMillisPerSecond * 1000LL; +static const int64_t kNumNanosPerMicro = 1000LL; + +void SleepForMicroseconds(int64_t microseconds) { + struct timespec sleep_time; + sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; + sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; + while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) + ; // Ignore signals and wait for the full interval to elapse. 
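+  // (nanosleep writes the unslept remainder back into sleep_time, so each
+  // retry resumes where the interrupted sleep left off.)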
+} + +void SleepForMilliseconds(int milliseconds) { + SleepForMicroseconds(static_cast(milliseconds) * kNumMicrosPerMilli); +} + +void SleepForSeconds(double seconds) { + SleepForMicroseconds(static_cast(seconds * kNumMicrosPerSecond)); +} + +#endif // OS_WINDOWS + + + diff --git a/src/sleep.h b/src/sleep.h new file mode 100644 index 00000000..35b4263b --- /dev/null +++ b/src/sleep.h @@ -0,0 +1,10 @@ +#ifndef BENCHMARK_SLEEP_H_ +#define BENCHMARK_SLEEP_H_ + +#include + +void SleepForMicroseconds(int64_t microseconds); +void SleepForMilliseconds(int milliseconds); +void SleepForSeconds(double seconds); + +#endif // BENCHMARK_SLEEP_H_ diff --git a/src/stat.h b/src/stat.h new file mode 100644 index 00000000..b121d47e --- /dev/null +++ b/src/stat.h @@ -0,0 +1,306 @@ +#ifndef BENCHMARK_STAT_H_ +#define BENCHMARK_STAT_H_ + +#include +#include +#include + +template +class Stat1; + +template +class Stat1MinMax; + +typedef Stat1 Stat1_f; +typedef Stat1 Stat1_d; +typedef Stat1MinMax Stat1MinMax_f; +typedef Stat1MinMax Stat1MinMax_d; + +template class Vector2; +template class Vector3; +template class Vector4; + +template +class Stat1 { + public: + typedef Stat1 Self; + + Stat1() { + Clear(); + } + void Clear() { + numsamples_ = NumType(); + sum_squares_ = sum_ = VType(); + } + // Create a sample of value dat and weight 1 + explicit Stat1(const VType &dat) { + sum_ = dat; + sum_squares_ = Sqr(dat); + numsamples_ = 1; + } + // Create statistics for all the samples between begin (included) + // and end(excluded) + explicit Stat1(const VType *begin, const VType *end) { + Clear(); + for ( const VType *item = begin; item < end; ++item ) { + (*this) += Stat1(*item); + } + } + // Create a sample of value dat and weight w + Stat1(const VType &dat, const NumType &w) { + sum_ = w * dat; + sum_squares_ = w * Sqr(dat); + numsamples_ = w; + } + // Copy operator + Stat1(const Self &stat) { + sum_ = stat.sum_; + sum_squares_ = stat.sum_squares_; + numsamples_ = stat.numsamples_; + } + + inline Self &operator =(const Self &stat) { + sum_ = stat.sum_; + sum_squares_ = stat.sum_squares_; + numsamples_ = stat.numsamples_; + return (*this); + } + // Merge statistics from two sample sets. + inline Self &operator +=(const Self &stat) { + sum_ += stat.sum_; + sum_squares_+= stat.sum_squares_; + numsamples_ += stat.numsamples_; + return (*this); + } + // The operation opposite to += + inline Self &operator -=(const Self &stat) { + sum_ -= stat.sum_; + sum_squares_-= stat.sum_squares_; + numsamples_ -= stat.numsamples_; + return (*this); + } + // Multiply the weight of the set of samples by a factor k + inline Self &operator *=(const VType &k) { + sum_ *= k; + sum_squares_*= k; + numsamples_ *= k; + return (*this); + } + // Merge statistics from two sample sets. + inline Self operator + (const Self &stat) const { + return Self(*this) += stat; + } + // The operation opposite to + + inline Self operator - (const Self &stat) const { + return Self(*this) -= stat; + } + // Multiply the weight of the set of samples by a factor k + inline Self operator * (const VType &k) const { + return Self(*this) *= k; + } + // Return the total weight of this sample set + NumType NumSamples() const { + return numsamples_; + } + // Return the sum of this sample set + VType Sum() const { + return sum_; + } + // Return the mean of this sample set + VType Mean() const { + if (numsamples_ == 0) return VType(); + return sum_ * (1.0 / numsamples_); + } + // Return the mean of this sample set and compute the standard deviation at + // the same time. 
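+  // (The standard deviation follows from the accumulated sums as
+  //  sqrt(sum_squares_/n - mean^2), i.e. sqrt(E[x^2] - E[x]^2).)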
+ VType Mean(VType *stddev) const { + if (numsamples_ == 0) return VType(); + VType mean = sum_ * (1.0 / numsamples_); + if (stddev) { + VType avg_squares = sum_squares_ * (1.0 / numsamples_); + *stddev = Sqrt(avg_squares - Sqr(mean)); + } + return mean; + } + // Return the standard deviation of the sample set + VType StdDev() const { + if (numsamples_ == 0) return VType(); + VType mean = Mean(); + VType avg_squares = sum_squares_ * (1.0 / numsamples_); + return Sqrt(avg_squares - Sqr(mean)); + } + private: + // Let i be the index of the samples provided (using +=) + // and weight[i],value[i] be the data of sample #i + // then the variables have the following meaning: + NumType numsamples_; // sum of weight[i]; + VType sum_; // sum of weight[i]*value[i]; + VType sum_squares_; // sum of weight[i]*value[i]^2; + + // Template function used to square a number. + // For a vector we square all components + template + static inline SType Sqr(const SType &dat) { + return dat * dat; + } + template + static inline Vector2 Sqr(const Vector2 &dat) { + return dat.MulComponents(dat); + } + template + static inline Vector3 Sqr(const Vector3 &dat) { + return dat.MulComponents(dat); + } + template + static inline Vector4 Sqr(const Vector4 &dat) { + return dat.MulComponents(dat); + } + + // Template function used to take the square root of a number. + // For a vector we square all components + template + static inline SType Sqrt(const SType &dat) { + // Avoid NaN due to imprecision in the calculations + if ( dat < 0 ) + return 0; + return sqrt(dat); + } + template + static inline Vector2 Sqrt(const Vector2 &dat) { + // Avoid NaN due to imprecision in the calculations + return Max(dat, Vector2()).Sqrt(); + } + template + static inline Vector3 Sqrt(const Vector3 &dat) { + // Avoid NaN due to imprecision in the calculations + return Max(dat, Vector3()).Sqrt(); + } + template + static inline Vector4 Sqrt(const Vector4 &dat) { + // Avoid NaN due to imprecision in the calculations + return Max(dat, Vector4()).Sqrt(); + } +}; + +// Useful printing function +template +inline std::ostream& operator<<(std::ostream& out, + const Stat1& s) { + out << "{ avg = " << s.Mean() + << " std = " << s.StdDev() + << " nsamples = " << s.NumSamples() << "}"; + return out; +} + + +// Stat1MinMax: same as Stat1, but it also +// keeps the Min and Max values; the "-" +// operator is disabled because it cannot be implemented +// efficiently +template +class Stat1MinMax : public Stat1 { + public: + typedef Stat1MinMax Self; + + Stat1MinMax() { + Clear(); + } + void Clear() { + Stat1::Clear(); + if (std::numeric_limits::has_infinity) { + min_ = std::numeric_limits::infinity(); + max_ = -std::numeric_limits::infinity(); + } else { + min_ = std::numeric_limits::max(); + max_ = std::numeric_limits::min(); + } + } + // Create a sample of value dat and weight 1 + explicit Stat1MinMax(const VType &dat) : Stat1(dat) { + max_ = dat; + min_ = dat; + } + // Create statistics for all the samples between begin (included) + // and end(excluded) + explicit Stat1MinMax(const VType *begin, const VType *end) { + Clear(); + for ( const VType *item = begin; item < end; ++item ) { + (*this) += Stat1MinMax(*item); + } + } + // Create a sample of value dat and weight w + Stat1MinMax(const VType &dat, const NumType &w) + : Stat1(dat, w) { + max_ = dat; + min_ = dat; + } + // Copy operator + Stat1MinMax(const Self &stat) : Stat1(stat) { + max_ = stat.max_; + min_ = stat.min_; + } + inline Self &operator =(const Self &stat) { + this->Stat1::operator=(stat); + 
max_ = stat.max_; + min_ = stat.min_; + return (*this); + } + // Merge statistics from two sample sets. + inline Self &operator +=(const Self &stat) { + this->Stat1::operator+=(stat); + if (stat.max_ > max_) max_ = stat.max_; + if (stat.min_ < min_) min_ = stat.min_; + return (*this); + } + // Multiply the weight of the set of samples by a factor k + inline Self &operator *=(const VType &stat) { + this->Stat1::operator*=(stat); + return (*this); + } + // Merge statistics from two sample sets. + inline Self operator + (const Self &stat) const { + return Self(*this) += stat; + } + // Multiply the weight of the set of samples by a factor k + inline Self operator * (const VType &k) const { + return Self(*this) *= k; + } + private: + // The - operation makes no sense with Min/Max + // unless we keep the full list of values (but we don't) + // make it private, and let it undefined so nobody can call it + Self &operator -=(const Self &stat); // senseless. let it undefined. + + // The operation opposite to - + Self operator - (const Self &stat) const; // senseless. let it undefined. + + public: + // Return the maximal value in this sample set + VType Max() const { + return max_; + } + // Return the minimal value in this sample set + VType Min() const { + return min_; + } + private: + // Let i be the index of the samples provided (using +=) + // and weight[i],value[i] be the data of sample #i + // then the variables have the following meaning: + VType max_; // max of value[i] + VType min_; // min of value[i] +}; + +// Useful printing function +template +inline std::ostream& operator <<(std::ostream& out, + const Stat1MinMax& s) { + out << "{ avg = " << s.Mean() + << " std = " << s.StdDev() + << " nsamples = " << s.NumSamples() + << " min = " << s.Min() + << " max = " << s.Max() << "}"; + return out; +} + +#endif // BENCHMARK_STAT_H_ diff --git a/src/sysinfo.cc b/src/sysinfo.cc new file mode 100644 index 00000000..644b66f0 --- /dev/null +++ b/src/sysinfo.cc @@ -0,0 +1,337 @@ +#include "sysinfo.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cycleclock.h" +#include "macros.h" +#include "mutex_lock.h" +#include "sleep.h" + +namespace { +pthread_once_t cpuinfo_init = PTHREAD_ONCE_INIT; +double cpuinfo_cycles_per_second = 1.0; +int cpuinfo_num_cpus = 1; // Conservative guess +static pthread_mutex_t cputimens_mutex; + +// Helper function estimates cycles/sec by observing cycles elapsed during +// sleep(). Using small sleep time decreases accuracy significantly. +int64_t EstimateCyclesPerSecond(const int estimate_time_ms) { + CHECK(estimate_time_ms > 0); + double multiplier = 1000.0 / (double)estimate_time_ms; // scale by this much + + const int64_t start_ticks = CycleClock::Now(); + SleepForMilliseconds(estimate_time_ms); + const int64_t guess = int64_t(multiplier * (CycleClock::Now() - start_ticks)); + return guess; +} + +// Helper function for reading an int from a file. Returns true if successful +// and the memory location pointed to by value is set to the value read. 
+bool ReadIntFromFile(const char *file, int *value) { + bool ret = false; + int fd = open(file, O_RDONLY); + if (fd != -1) { + char line[1024]; + char* err; + memset(line, '\0', sizeof(line)); + CHECK(read(fd, line, sizeof(line) - 1)); + const int temp_value = strtol(line, &err, 10); + if (line[0] != '\0' && (*err == '\n' || *err == '\0')) { + *value = temp_value; + ret = true; + } + close(fd); + } + return ret; +} + +void InitializeSystemInfo() { + bool saw_mhz = false; + + // TODO: destroy this + pthread_mutex_init(&cputimens_mutex, NULL); + +#if defined OS_LINUX || defined OS_CYGWIN + char line[1024]; + char* err; + int freq; + + // If the kernel is exporting the tsc frequency use that. There are issues + // where cpuinfo_max_freq cannot be relied on because the BIOS may be + // exporintg an invalid p-state (on x86) or p-states may be used to put the + // processor in a new mode (turbo mode). Essentially, those frequencies + // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as + // well. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq)) { + // The value is in kHz (as the file name suggests). For example, on a + // 2GHz warpstation, the file contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } + + // If CPU scaling is in effect, we want to use the *maximum* frequency, + // not whatever CPU speed some random processor happens to be using now. + if (!saw_mhz && + ReadIntFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", + &freq)) { + // The value is in kHz. For example, on a 2GHz warpstation, the file + // contains the value "2000000". + cpuinfo_cycles_per_second = freq * 1000.0; + saw_mhz = true; + } + + // Read /proc/cpuinfo for other values, and if there is no cpuinfo_max_freq. + const char* pname = "/proc/cpuinfo"; + int fd = open(pname, O_RDONLY); + if (fd == -1) { + perror(pname); + if (!saw_mhz) { + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } + return; // TODO: use generic tester instead? + } + + double bogo_clock = 1.0; + bool saw_bogo = false; + int max_cpu_id = 0; + int num_cpus = 0; + line[0] = line[1] = '\0'; + int chars_read = 0; + do { // we'll exit when the last read didn't read anything + // Move the next line to the beginning of the buffer + const int oldlinelen = strlen(line); + if (sizeof(line) == oldlinelen + 1) // oldlinelen took up entire line + line[0] = '\0'; + else // still other lines left to save + memmove(line, line + oldlinelen+1, sizeof(line) - (oldlinelen+1)); + // Terminate the new line, reading more if we can't find the newline + char* newline = strchr(line, '\n'); + if (newline == NULL) { + const int linelen = strlen(line); + const int bytes_to_read = sizeof(line)-1 - linelen; + CHECK(bytes_to_read > 0); // because the memmove recovered >=1 bytes + chars_read = read(fd, line + linelen, bytes_to_read); + line[linelen + chars_read] = '\0'; + newline = strchr(line, '\n'); + } + if (newline != NULL) + *newline = '\0'; + + // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only + // accept postive values. Some environments (virtual machines) report zero, + // which would cause infinite looping in WallTime_Init. 
+ if (!saw_mhz && strncasecmp(line, "cpu MHz", sizeof("cpu MHz")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + cpuinfo_cycles_per_second = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && cpuinfo_cycles_per_second > 0) + saw_mhz = true; + } + } else if (strncasecmp(line, "bogomips", sizeof("bogomips")-1) == 0) { + const char* freqstr = strchr(line, ':'); + if (freqstr) { + bogo_clock = strtod(freqstr+1, &err) * 1000000.0; + if (freqstr[1] != '\0' && *err == '\0' && bogo_clock > 0) + saw_bogo = true; + } + } else if (strncasecmp(line, "processor", sizeof("processor")-1) == 0) { + num_cpus++; // count up every time we see an "processor :" entry + const char* freqstr = strchr(line, ':'); + if (freqstr) { + const int cpu_id = strtol(freqstr+1, &err, 10); + if (freqstr[1] != '\0' && *err == '\0' && max_cpu_id < cpu_id) + max_cpu_id = cpu_id; + } + } + } while (chars_read > 0); + close(fd); + + if (!saw_mhz) { + if (saw_bogo) { + // If we didn't find anything better, we'll use bogomips, but + // we're not happy about it. + cpuinfo_cycles_per_second = bogo_clock; + } else { + // If we don't even have bogomips, we'll use the slow estimation. + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } + } + if (num_cpus == 0) { + fprintf(stderr, "Failed to read num. CPUs correctly from /proc/cpuinfo\n"); + } else { + if ((max_cpu_id + 1) != num_cpus) { + fprintf(stderr, + "CPU ID assignments in /proc/cpuinfo seems messed up." + " This is usually caused by a bad BIOS.\n"); + } + cpuinfo_num_cpus = num_cpus; + } + +#elif defined OS_FREEBSD + // For this sysctl to work, the machine must be configured without + // SMP, APIC, or APM support. hz should be 64-bit in freebsd 7.0 + // and later. Before that, it's a 32-bit quantity (and gives the + // wrong answer on machines faster than 2^32 Hz). See + // http://lists.freebsd.org/pipermail/freebsd-i386/2004-November/001846.html + // But also compare FreeBSD 7.0: + // http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG70#L223 + // 231 error = sysctl_handle_quad(oidp, &freq, 0, req); + // To FreeBSD 6.3 (it's the same in 6-STABLE): + // http://fxr.watson.org/fxr/source/i386/i386/tsc.c?v=RELENG6#L131 + // 139 error = sysctl_handle_int(oidp, &freq, sizeof(freq), req); +#if __FreeBSD__ >= 7 + uint64_t hz = 0; +#else + unsigned int hz = 0; +#endif + size_t sz = sizeof(hz); + const char *sysctl_path = "machdep.tsc_freq"; + if ( sysctlbyname(sysctl_path, &hz, &sz, NULL, 0) != 0 ) { + fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", + sysctl_path, strerror(errno)); + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); + } else { + cpuinfo_cycles_per_second = hz; + } + // TODO: also figure out cpuinfo_num_cpus + +#elif defined OS_WINDOWS +# pragma comment(lib, "shlwapi.lib") // for SHGetValue() + // In NT, read MHz from the registry. If we fail to do so or we're in win9x + // then make a crude estimate. + OSVERSIONINFO os; + os.dwOSVersionInfoSize = sizeof(os); + DWORD data, data_size = sizeof(data); + if (GetVersionEx(&os) && + os.dwPlatformId == VER_PLATFORM_WIN32_NT && + SUCCEEDED(SHGetValueA(HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "~MHz", NULL, &data, &data_size))) + cpuinfo_cycles_per_second = (int64)data * (int64)(1000 * 1000); // was mhz + else + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(500); // TODO <500? 
+ // TODO: also figure out cpuinfo_num_cpus + +#elif defined OS_MACOSX + // returning "mach time units" per second. the current number of elapsed + // mach time units can be found by calling uint64 mach_absolute_time(); + // while not as precise as actual CPU cycles, it is accurate in the face + // of CPU frequency scaling and multi-cpu/core machines. + // Our mac users have these types of machines, and accuracy + // (i.e. correctness) trumps precision. + // See cycleclock.h: CycleClock::Now(), which returns number of mach time + // units on Mac OS X. + mach_timebase_info_data_t timebase_info; + mach_timebase_info(&timebase_info); + double mach_time_units_per_nanosecond = + static_cast(timebase_info.denom) / + static_cast(timebase_info.numer); + cpuinfo_cycles_per_second = mach_time_units_per_nanosecond * 1e9; + + int num_cpus = 0; + size_t size = sizeof(num_cpus); + int numcpus_name[] = { CTL_HW, HW_NCPU }; + if (::sysctl(numcpus_name, arraysize(numcpus_name), &num_cpus, &size, 0, 0) + == 0 + && (size == sizeof(num_cpus))) + cpuinfo_num_cpus = num_cpus; + +#else + // Generic cycles per second counter + cpuinfo_cycles_per_second = EstimateCyclesPerSecond(1000); +#endif +} +} // end namespace + +#ifndef OS_WINDOWS +// getrusage() based implementation of MyCPUUsage +static double MyCPUUsageRUsage() { + struct rusage ru; + if (getrusage(RUSAGE_SELF, &ru) == 0) { + return (static_cast(ru.ru_utime.tv_sec) + + static_cast(ru.ru_utime.tv_usec)*1e-6 + + static_cast(ru.ru_stime.tv_sec) + + static_cast(ru.ru_stime.tv_usec)*1e-6); + } else { + return 0.0; + } +} + +static bool MyCPUUsageCPUTimeNsLocked(double *cputime) { + static int cputime_fd = -1; + if (cputime_fd == -1) { + cputime_fd = open("/proc/self/cputime_ns", O_RDONLY); + if (cputime_fd < 0) { + cputime_fd = -1; + return false; + } + } + char buff[64]; + memset(buff, 0, sizeof(buff)); + if (pread(cputime_fd, buff, sizeof(buff)-1, 0) <= 0) { + close(cputime_fd); + cputime_fd = -1; + return false; + } + unsigned long long result = strtoull(buff, NULL, 0); + if (result == (std::numeric_limits::max)()) { + close(cputime_fd); + cputime_fd = -1; + return false; + } + *cputime = static_cast(result) / 1e9; + return true; +} + +double MyCPUUsage() { + { + mutex_lock l(&cputimens_mutex); + static bool use_cputime_ns = true; + if (use_cputime_ns) { + double value; + if (MyCPUUsageCPUTimeNsLocked(&value)) { + return value; + } + // Once MyCPUUsageCPUTimeNsLocked fails once fall back to getrusage(). + std::cout << "Reading /proc/self/cputime_ns failed. 
Using getrusage().\n"; + use_cputime_ns = false; + } + } + return MyCPUUsageRUsage(); +} + +double ChildrenCPUUsage() { + struct rusage ru; + if (getrusage(RUSAGE_CHILDREN, &ru) == 0) { + return (static_cast(ru.ru_utime.tv_sec) + + static_cast(ru.ru_utime.tv_usec)*1e-6 + + static_cast(ru.ru_stime.tv_sec) + + static_cast(ru.ru_stime.tv_usec)*1e-6); + } else { + return 0.0; + } +} +#endif // OS_WINDOWS + +double CyclesPerSecond(void) { + pthread_once(&cpuinfo_init, &InitializeSystemInfo); + return cpuinfo_cycles_per_second; +} + +int NumCPUs(void) { + pthread_once(&cpuinfo_init, &InitializeSystemInfo); + return cpuinfo_num_cpus; +} + diff --git a/src/sysinfo.h b/src/sysinfo.h new file mode 100644 index 00000000..0b85d5cc --- /dev/null +++ b/src/sysinfo.h @@ -0,0 +1,9 @@ +#ifndef BENCHMARK_SYSINFO_H_ +#define BENCHMARK_SYSINFO_H_ + +double MyCPUUsage(); +double ChildrenCPUUsage(); +int NumCPUs(); +double CyclesPerSecond(); + +#endif // BENCHMARK_SYSINFO_H_ diff --git a/src/walltime.cc b/src/walltime.cc new file mode 100644 index 00000000..85384aac --- /dev/null +++ b/src/walltime.cc @@ -0,0 +1,137 @@ +#include "walltime.h" + +#include +#include +#include +#include + +#include +#include + +#include "cycleclock.h" +#include "macros.h" +#include "sysinfo.h" + +namespace walltime { +namespace { +const double kMaxErrorInterval = 100e-6; + +std::atomic initialized(false); +WallTime base_walltime = 0.0; +int64_t base_cycletime = 0; +int64_t cycles_per_second; +double seconds_per_cycle; +uint32_t last_adjust_time = 0; +std::atomic drift_adjust(0); +int64_t max_interval_cycles = 0; + +// Helper routines to load/store a float from an AtomicWord. Required because +// g++ < 4.7 doesn't support std::atomic correctly. I cannot wait to get +// rid of this horror show. +inline void SetDrift(float f) { + int32_t w; + memcpy(&w, &f, sizeof(f)); + std::atomic_store(&drift_adjust, w); +} + +inline float GetDrift() { + float f; + int32_t w = std::atomic_load(&drift_adjust); + memcpy(&f, &w, sizeof(f)); + return f; +} + +static_assert(sizeof(float) <= sizeof(int32_t), + "type sizes don't allow the drift_adjust hack"); + +WallTime Slow() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + tv.tv_usec * 1e-6; +} + +bool SplitTimezone(WallTime value, bool local, struct tm* t, + double* subsecond) { + memset(t, 0, sizeof(*t)); + if ((value < 0) || (value > std::numeric_limits::max())) { + *subsecond = 0.0; + return false; + } + const time_t whole_time = static_cast(value); + *subsecond = value - whole_time; + if (local) + localtime_r(&whole_time, t); + else + gmtime_r(&whole_time, t); + return true; +} +} // end namespace + +// This routine should be invoked to initialize walltime. +// It is not intended for general purpose use. +void Initialize() { + CHECK(!std::atomic_load(&initialized)); + cycles_per_second = static_cast(CyclesPerSecond()); + CHECK(cycles_per_second != 0); + seconds_per_cycle = 1.0 / cycles_per_second; + max_interval_cycles = static_cast( + cycles_per_second * kMaxErrorInterval); + do { + base_cycletime = CycleClock::Now(); + base_walltime = Slow(); + } while (CycleClock::Now() - base_cycletime > max_interval_cycles); + // We are now sure that "base_walltime" and "base_cycletime" were produced + // within kMaxErrorInterval of one another. 
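+  // Start with zero drift; Now() re-measures the drift against the slow
+  // gettimeofday() clock roughly every 2^32 cycles.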
+ + SetDrift(0.0); + last_adjust_time = static_cast(uint64_t(base_cycletime) >> 32); + std::atomic_store(&initialized, true); +} + +WallTime Now() { + if (!std::atomic_load(&initialized)) + return Slow(); + + WallTime now = 0.0; + WallTime result = 0.0; + int64_t ct = 0; + uint32_t top_bits = 0; + do { + ct = CycleClock::Now(); + int64_t cycle_delta = ct - base_cycletime; + result = base_walltime + cycle_delta * seconds_per_cycle; + + top_bits = static_cast(uint64_t(ct) >> 32); + // Recompute drift no more often than every 2^32 cycles. + // I.e., @2GHz, ~ every two seconds + if (top_bits == last_adjust_time) { // don't need to recompute drift + return result + GetDrift(); + } + + now = Slow(); + } while (CycleClock::Now() - ct > max_interval_cycles); + // We are now sure that "now" and "result" were produced within + // kMaxErrorInterval of one another. + + SetDrift(now - result); + last_adjust_time = top_bits; + return now; +} + +const char* Print(WallTime time, const char *format, bool local, + char* storage, int *remainder_us) { + struct tm split; + double subsecond; + if (!SplitTimezone(time, local, &split, &subsecond)) { + snprintf(storage, sizeof(storage), "Invalid time: %f", time); + } else { + if (remainder_us != NULL) { + *remainder_us = static_cast((subsecond * 1000000) + 0.5); + if (*remainder_us > 999999) *remainder_us = 999999; + if (*remainder_us < 0) *remainder_us = 0; + } + strftime(storage, sizeof(storage), format, &split); + } + return storage; +} +} // end namespace walltime diff --git a/src/walltime.h b/src/walltime.h new file mode 100644 index 00000000..13cda806 --- /dev/null +++ b/src/walltime.h @@ -0,0 +1,19 @@ +#ifndef BENCHMARK_WALLTIME_H_ +#define BENCHMARK_WALLTIME_H_ + +typedef double WallTime; + +namespace walltime { +void Initialize(); +WallTime Now(); + +// GIVEN: walltime, generic format string (as understood by strftime), +// a boolean flag specifying if the time is local or UTC (true=local). +// RETURNS: the formatted string. ALSO RETURNS: the storage printbuffer +// passed and the remaining number of microseconds (never printed in +// the string since strftime does not understand it) +const char* Print(WallTime time, const char *format, bool local, + char* storage, int *remainder_us); +} // end namespace walltime + +#endif // BENCHMARK_WALLTIME_H_ diff --git a/test/benchmark_test.cc b/test/benchmark_test.cc new file mode 100644 index 00000000..7c0c6d77 --- /dev/null +++ b/test/benchmark_test.cc @@ -0,0 +1,138 @@ +#include "benchmark/benchmark.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace { + +int ATTRIBUTE_NOINLINE Factorial(uint32_t n) { + return (n == 1) ? 
1 : n * Factorial(n - 1);
+}
+
+double CalculatePi(int depth) {
+  double pi = 0.0;
+  for (int i = 0; i < depth; ++i) {
+    double numerator = static_cast<double>(((i % 2) * 2) - 1);
+    double denominator = static_cast<double>((2 * i) - 1);
+    pi += numerator / denominator;
+  }
+  return (pi - 1.0) * 4;
+}
+
+std::set<int> ConstructRandomSet(int size) {
+  std::set<int> s;
+  for (int i = 0; i < size; ++i)
+    s.insert(i);
+  return s;
+}
+
+static std::vector<int>* test_vector = NULL;
+
+} // end namespace
+
+#ifdef DEBUG
+static void BM_Factorial(benchmark::State& state) {
+  int fac_42 = 0;
+  while (state.KeepRunning())
+    fac_42 = Factorial(8);
+  // Prevent compiler optimizations
+  CHECK(fac_42 != std::numeric_limits<int>::max());
+}
+BENCHMARK(BM_Factorial);
+#endif
+
+static void BM_CalculatePiRange(benchmark::State& state) {
+  double pi = 0.0;
+  while (state.KeepRunning())
+    pi = CalculatePi(state.range_x());
+  std::stringstream ss;
+  ss << pi;
+  state.SetLabel(ss.str());
+}
+BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024);
+
+static void BM_CalculatePi(benchmark::State& state) {
+  static const int depth = 1024;
+  double pi ATTRIBUTE_UNUSED = 0.0;
+  while (state.KeepRunning()) {
+    pi = CalculatePi(depth);
+  }
+}
+BENCHMARK(BM_CalculatePi)->Threads(8);
+BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32);
+BENCHMARK(BM_CalculatePi)->ThreadPerCpu();
+
+static void BM_SetInsert(benchmark::State& state) {
+  while (state.KeepRunning()) {
+    state.PauseTiming();
+    std::set<int> data = ConstructRandomSet(state.range_x());
+    state.ResumeTiming();
+    for (int j = 0; j < state.range_y(); ++j)
+      data.insert(rand());
+  }
+}
+BENCHMARK(BM_SetInsert)->RangePair(1<<10, 8<<10, 1, 10);
+
+template <class Q>
+static void BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  while (state.KeepRunning())
+    for (int i = state.range_x(); --i; )
+      q.push_back(v);
+  const int64_t items_processed =
+      static_cast<int64_t>(state.iterations()) * state.range_x();
+  state.SetItemsProcessed(items_processed);
+  state.SetBytesProcessed(items_processed * sizeof(v));
+}
+// (int is an arbitrary default-constructible element type for these
+// instantiations.)
+BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>)->Range(1 << 0, 1 << 10);
+BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
+
+static void BM_StringCompare(benchmark::State& state) {
+  std::string s1(state.range_x(), '-');
+  std::string s2(state.range_x(), '-');
+  int r = 0;
+  while (state.KeepRunning())
+    r |= s1.compare(s2);
+  // Prevent compiler optimizations
+  CHECK(r != std::numeric_limits<int>::max());
+}
+BENCHMARK(BM_StringCompare)->Range(1, 1<<20);
+
+static void BM_SetupTeardown(benchmark::State& state) {
+  if (state.thread_index == 0)
+    test_vector = new std::vector<int>();
+  while (state.KeepRunning())
+    test_vector->push_back(0);
+  if (state.thread_index == 0) {
+    delete test_vector;
+    test_vector = NULL;
+  }
+}
+BENCHMARK(BM_SetupTeardown);
+
+static void BM_LongTest(benchmark::State& state) {
+  double tracker = 0.0;
+  while (state.KeepRunning())
+    for (int i = 0; i < state.range_x(); ++i)
+      tracker += i;
+  CHECK(tracker != 0.0);
+}
+BENCHMARK(BM_LongTest)->Range(1<<16, 1<<28);
+
+int main(int argc, const char* argv[]) {
+  benchmark::Initialize(&argc, argv);
+
+  CHECK(Factorial(8) == 40320);
+  CHECK(CalculatePi(1) == 0.0);
+
+  benchmark::RunSpecifiedBenchmarks();
+}