bench: Adding a simple benchmarking setup

2014-05-07 17:42:57 -07:00 · 2014-05-07 17:42:57 -07:00 · 1aabbb5542
parent 01e085cc64
commit 1aabbb5542
8 changed files with 358 additions and 0 deletions
--- a/bench/Makefile
+++ b/bench/Makefile
@ -0,0 +1,24 @@
+# Benchmarks for the Consul KV HTTP endpoint, driven by the `boom` load tester.
+# Any variable below can be overridden on the command line,
+# e.g. `make put REQ=1000 CLIENTS=8`.
+
+# Number of requests issued per test (passed to boom as -n)
+REQ=20480
+# Value passed to boom as -c
+CLIENTS=64
+# URL of the KV key that every test reads or writes
+ADDR=http://localhost:8500/v1/kv/bench
+# Request body for the PUT test (passed to boom as -d). The double quotes are
+# part of the make value and are consumed by the shell when the recipe runs.
+DATA="74a31e96-1d0f-4fa7-aa14-7212a326986e"
+# GOMAXPROCS set in the environment of each boom invocation
+MAXPROCS=4
+
+# None of the targets produce files, so always run them when requested.
+.PHONY: all put get-default get-stale get-consistent
+
+# The tests all hit the same endpoint; never let `make -j` overlap them.
+.NOTPARALLEL:
+
+all: put get-default get-stale get-consistent
+
+# URLs are quoted below so that `?` is never treated as a shell glob character.
+
+put:
+	@echo "===== PUT test ====="
+	GOMAXPROCS=${MAXPROCS} boom -m PUT -d ${DATA} -n ${REQ} -c ${CLIENTS} "${ADDR}"
+
+get-default:
+	@echo "===== GET default test ====="
+	GOMAXPROCS=${MAXPROCS} boom -n ${REQ} -c ${CLIENTS} "${ADDR}"
+
+get-stale:
+	@echo "===== GET stale test ====="
+	GOMAXPROCS=${MAXPROCS} boom -n ${REQ} -c ${CLIENTS} "${ADDR}?stale"
+
+get-consistent:
+	@echo "===== GET consistent test ====="
+	GOMAXPROCS=${MAXPROCS} boom -n ${REQ} -c ${CLIENTS} "${ADDR}?consistent"
+
--- a/bench/README.md
+++ b/bench/README.md
@ -0,0 +1,35 @@
+Consul Benchmark
+================
+
+This repo contains the Packer automation necessary for the Consul benchmarks.
+
+There is a single main Packer file `bench.json`. To use it, the variables
+for `do_client_id` and `do_api_key` must be provided. These correspond to
+your DigitalOcean client id and API key.
+
+When Packer runs, it will generate 3 images:
+* bench-bootstrap - Consul server in bootstrap mode
+* bench-server - Consul server
+* bench-worker - Worker node
+
+For the benchmark you should start 1 bootstrap instance, and 2 normal
+servers. As many workers as desired can be started. Once the nodes are
+up, you must SSH into one of the Consul servers.
+
+Connect all the nodes with:
+
+    $ consul join <n1> ... <n5>
+
+This will connect all the nodes within the same datacenter.
+
+To run the benchmarks, use the Makefile:
+
+    $ cd /
+    $ make # Runs all the benchmarks
+    $ make put # Runs only the PUT benchmarks
+
+There is currently no good way to make multiple workers run at the same
+time, so I just type in the make command and rapidly start the test on all
+workers. It is not perfect, but the test runs long enough that the calls
+overlap.
+
--- a/bench/bench.json
+++ b/bench/bench.json
@ -0,0 +1,86 @@
+{
+    "variables": {
+        "do_client_id": "",
+        "do_api_key": ""
+    },
+    "builders": [
+        {
+        "type": "digitalocean",
+        "api_key": "{{ user `do_api_key` }}",
+        "client_id": "{{ user `do_client_id` }}",
+        "region_id": "1",
+        "size_id": "61",
+        "image_id": "3101045",
+        "snapshot_name": "bench-bootstrap-{{ isotime }}",
+        "name": "bootstrap"
+    },
+    {
+        "type": "digitalocean",
+        "api_key": "{{ user `do_api_key` }}",
+        "client_id": "{{ user `do_client_id` }}",
+        "region_id": "1",
+        "size_id": "61",
+        "image_id": "3101045",
+        "snapshot_name": "bench-server-{{ isotime }}",
+        "name": "server"
+    },
+    {
+        "type": "digitalocean",
+        "api_key": "{{ user `do_api_key` }}",
+        "client_id": "{{ user `do_client_id` }}",
+        "region_id": "1",
+        "size_id": "61",
+        "image_id": "3101045",
+        "snapshot_name": "bench-worker-{{ isotime }}",
+        "name": "worker"
+    }
+    ],
+    "provisioners":[
+    {
+        "type": "file",
+        "source": "conf/upstart.conf",
+        "destination": "/etc/init/consul.conf"
+    },
+    {
+        "type": "shell",
+        "inline": [
+            "mkdir -p /etc/consul.d",
+            "apt-get update",
+            "apt-get install -y unzip make",
+            "wget https://dl.bintray.com/mitchellh/consul/0.2.0_linux_amd64.zip",
+            "unzip 0.2.0_linux_amd64.zip",
+            "mv consul /usr/local/bin/consul",
+            "chmod +x /usr/local/bin/consul"
+        ]
+    },
+    {
+        "type": "file",
+        "source": "conf/common.json",
+        "destination": "/etc/consul.d/common.json"
+    },
+    {
+        "type": "file",
+        "source": "conf/bootstrap.json",
+        "destination": "/etc/consul.d/bootstrap.json",
+        "only": ["bootstrap"]
+    },
+    {
+        "type": "file",
+        "source": "conf/server.json",
+        "destination": "/etc/consul.d/server.json",
+        "only": ["server"]
+    },
+    {
+        "type": "shell",
+        "inline": [
+            "curl -f https://s3.amazonaws.com/hc-ops/boom_linux_amd64 -o /usr/bin/boom",
+            "chmod +x /usr/bin/boom"
+        ]
+    },
+    {
+        "type": "file",
+        "source": "Makefile",
+        "destination": "/Makefile"
+    }
+    ]
+}
--- a/bench/conf/bootstrap.json
+++ b/bench/conf/bootstrap.json
@ -0,0 +1,4 @@
+{
+    "bootstrap": true,
+    "server": true
+}
--- a/bench/conf/common.json
+++ b/bench/conf/common.json
@ -0,0 +1,4 @@
+{
+    "data_dir": "/var/lib/consul",
+    "log_level": "info"
+}
--- a/bench/conf/server.json
+++ b/bench/conf/server.json
@ -0,0 +1,3 @@
+{
+    "server": true
+}
--- a/bench/conf/upstart.conf
+++ b/bench/conf/upstart.conf
@ -0,0 +1,24 @@
+# Upstart job that runs the Consul agent.
+description "Consul agent"
+
+# Start in runlevels 2-5 and stop in every other runlevel.
+start on runlevel [2345]
+stop on runlevel [!2345]
+
+# Restart the agent if it exits.
+respawn
+
+script
+  # Source optional per-host settings when the file exists
+  # (for example CONSUL_FLAGS, which is passed to the agent below).
+  if test -f /etc/service/consul; then
+    . /etc/service/consul
+  fi
+
+  # Use every CPU, because Consul can block a scheduler thread.
+  export GOMAXPROCS=$(nproc)
+
+  # Get the public IP: take the second field of eth0's "inet addr" line
+  # and drop its first five characters.
+  BIND=$(ifconfig eth0 | awk '/inet addr/ { print substr($2,6) }')
+
+  # Replace the shell with the agent; stdout and stderr are appended to the log.
+  exec /usr/local/bin/consul agent \
+    -config-dir="/etc/consul.d" \
+    -bind=${BIND} \
+    ${CONSUL_FLAGS} \
+    >>/var/log/consul.log 2>&1
+end script
--- a/bench/results.md
+++ b/bench/results.md
@ -0,0 +1,178 @@
+# Consul Benchmark Results
+
+As part of a benchmark, we started a 5 node DigitalOcean cluster to run it on.
+There are 3 servers, meaning writes must commit to at least 2 servers.
+The cluster uses the 16GB DigitalOcean droplet which has the following specs:
+
+ * 8 CPU Cores, 2GHz
+ * 16GB RAM
+ * 160GB SSD disk
+ * 1Gbps NIC
+
+We used `bonnie++` to benchmark the disk, and the key metrics are:
+
+ * 188MB/s sequential write
+ * 86MB/s sequential read-write-flush
+ * 840MB/s sequential read
+ * 2636 random seeks per second
+
+# Output
+
+Below is the output for a test run on a benchmark cluster. We ran the benchmark
+several times to warm up the nodes, and this is just a single representative sample.
+
+Note that a single worker was running the benchmark. This means the "stale" test is
+not representative of total throughput, as the client was only routing to a single server.
+
+    ===== PUT test =====
+    GOMAXPROCS=4 boom -m PUT -d "74a31e96-1d0f-4fa7-aa14-7212a326986e" -n 20480 -c 64 http://localhost:8500/v1/kv/bench
+    20480 / 20480 Booooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo! 100.00 %
+
+    Summary:
+      Total:	19.4302 secs.
+      Slowest:	0.1715 secs.
+      Fastest:	0.0157 secs.
+      Average:	0.0606 secs.
+      Requests/sec:	1054.0313
+      Total Data Received:	102400 bytes.
+      Response Size per Request:	5 bytes.
+
+    Status code distribution:
+      [200]	20480 responses
+
+    Response time histogram:
+      0.016 [1]	|
+      0.031 [233]	|∎
+      0.047 [4120]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.062 [8079]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.078 [5082]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.094 [2045]	|∎∎∎∎∎∎∎∎∎∎
+      0.109 [656]	|∎∎∎
+      0.125 [200]	|
+      0.140 [12]	|
+      0.156 [31]	|
+      0.172 [21]	|
+
+    Latency distribution:
+      10% in 0.0416 secs.
+      25% in 0.0484 secs.
+      50% in 0.0579 secs.
+      75% in 0.0697 secs.
+      90% in 0.0835 secs.
+      95% in 0.0919 secs.
+      99% in 0.1113 secs.
+
+    ===== GET default test =====
+    GOMAXPROCS=4 boom -n 20480 -c 64 http://localhost:8500/v1/kv/bench
+    20480 / 20480 Booooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo! 100.00 %
+
+    Summary:
+      Total:	9.6804 secs.
+      Slowest:	0.0830 secs.
+      Fastest:	0.0023 secs.
+      Average:	0.0302 secs.
+      Requests/sec:	2115.6096
+      Total Data Received:	2560000 bytes.
+      Response Size per Request:	125 bytes.
+
+    Status code distribution:
+      [200]	20480 responses
+
+    Response time histogram:
+      0.002 [1]	|
+      0.010 [143]	|
+      0.018 [1666]	|∎∎∎∎∎∎∎∎∎
+      0.026 [6009]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.035 [6732]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.043 [3857]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.051 [1389]	|∎∎∎∎∎∎∎∎
+      0.059 [459]	|∎∎
+      0.067 [154]	|
+      0.075 [53]	|
+      0.083 [17]	|
+
+    Latency distribution:
+      10% in 0.0189 secs.
+      25% in 0.0233 secs.
+      50% in 0.0291 secs.
+      75% in 0.0358 secs.
+      90% in 0.0427 secs.
+      95% in 0.0476 secs.
+      99% in 0.0597 secs.
+
+    ===== GET stale test =====
+    GOMAXPROCS=4 boom -n 20480 -c 64 http://localhost:8500/v1/kv/bench?stale
+    20480 / 20480 Booooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo! 100.00 %
+
+    Summary:
+      Total:	10.3082 secs.
+      Slowest:	0.0972 secs.
+      Fastest:	0.0015 secs.
+      Average:	0.0322 secs.
+      Requests/sec:	1986.7714
+      Total Data Received:	2560000 bytes.
+      Response Size per Request:	125 bytes.
+
+    Status code distribution:
+      [200]	20480 responses
+
+    Response time histogram:
+      0.002 [1]	|
+      0.011 [320]	|∎
+      0.021 [2558]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.030 [6247]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.040 [6895]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.049 [3174]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.059 [971]	|∎∎∎∎∎
+      0.068 [249]	|∎
+      0.078 [52]	|
+      0.088 [11]	|
+      0.097 [2]	|
+
+    Latency distribution:
+      10% in 0.0187 secs.
+      25% in 0.0246 secs.
+      50% in 0.0317 secs.
+      75% in 0.0387 secs.
+      90% in 0.0461 secs.
+      95% in 0.0511 secs.
+      99% in 0.0618 secs.
+
+    ===== GET consistent test =====
+    GOMAXPROCS=4 boom -n 20480 -c 64 http://localhost:8500/v1/kv/bench?consistent
+    20480 / 20480 Booooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo! 100.00 %
+
+    Summary:
+      Total:	10.4835 secs.
+      Slowest:	0.0991 secs.
+      Fastest:	0.0024 secs.
+      Average:	0.0327 secs.
+      Requests/sec:	1953.5549
+      Total Data Received:	2560000 bytes.
+      Response Size per Request:	125 bytes.
+
+    Status code distribution:
+      [200]	20480 responses
+
+    Response time histogram:
+      0.002 [1]	|
+      0.012 [137]	|
+      0.022 [2405]	|∎∎∎∎∎∎∎∎∎∎∎∎
+      0.031 [7754]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.041 [6382]	|∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.051 [2629]	|∎∎∎∎∎∎∎∎∎∎∎∎∎
+      0.060 [826]	|∎∎∎∎
+      0.070 [245]	|∎
+      0.080 [81]	|
+      0.089 [17]	|
+      0.099 [3]	|
+
+    Latency distribution:
+      10% in 0.0208 secs.
+      25% in 0.0254 secs.
+      50% in 0.0314 secs.
+      75% in 0.0384 secs.
+      90% in 0.0463 secs.
+      95% in 0.0518 secs.
+      99% in 0.0645 secs.
+