From 094c67de88f41eae494a3823f8aaf0f77b25b980 Mon Sep 17 00:00:00 2001
From: ckennelly <ckennelly@google.com>
Date: Mon, 27 Jun 2016 05:01:31 -0700
Subject: [PATCH] Speed up the EmitLiteral fast path, +1.62% for ZFlat
 benchmarks.

This is inspired by the Go version in
//third_party/golang/snappy/encode_amd64.s (emitLiteralFastPath)

        Benchmark         Base:Reference   (1)
--------------------------------------------------
(BM_ZFlat_0 1/cputime_ns)        9.669e-06  +1.65%
(BM_ZFlat_1 1/cputime_ns)        7.643e-07  +2.53%
(BM_ZFlat_10 1/cputime_ns)       1.107e-05  -0.97%
(BM_ZFlat_11 1/cputime_ns)       3.002e-06  +0.71%
(BM_ZFlat_12 1/cputime_ns)       2.338e-05  +7.22%
(BM_ZFlat_13 1/cputime_ns)       6.386e-05  +9.18%
(BM_ZFlat_14 1/cputime_ns)       0.0002256  -0.05%
(BM_ZFlat_15 1/cputime_ns)       7.608e-07  -1.29%
(BM_ZFlat_16 1/cputime_ns)        0.003236  -1.28%
(BM_ZFlat_17 1/cputime_ns)        2.58e-06  +0.52%
(BM_ZFlat_18 1/cputime_ns)         0.01538  +0.00%
(BM_ZFlat_19 1/cputime_ns)       1.436e-05  +6.21%
(BM_ZFlat_2 1/cputime_ns)        0.0001044  +4.99%
(BM_ZFlat_20 1/cputime_ns)       0.0001608  -0.18%
(BM_ZFlat_3 1/cputime_ns)         0.003745  +0.38%
(BM_ZFlat_4 1/cputime_ns)        8.144e-05  +6.21%
(BM_ZFlat_5 1/cputime_ns)        2.328e-06  -1.60%
(BM_ZFlat_6 1/cputime_ns)        2.391e-06  +0.06%
(BM_ZFlat_7 1/cputime_ns)         2.68e-06  -0.61%
(BM_ZFlat_8 1/cputime_ns)        8.852e-07  +0.19%
(BM_ZFlat_9 1/cputime_ns)        6.441e-07  +1.06%

geometric mean                              +1.62%
---
 snappy.cc | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/snappy.cc b/snappy.cc
index 3b9988a..089219c 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -157,26 +157,30 @@ static inline char* EmitLiteral(char* op,
                                 const char* literal,
                                 int len,
                                 bool allow_fast_path) {
-  int n = len - 1;      // Zero-length literals are disallowed
-  if (n < 60) {
+  // The vast majority of copies are below 16 bytes, for which a
+  // call to memcpy is overkill. This fast path can sometimes
+  // copy up to 15 bytes too much, but that is okay in the
+  // main loop, since we have a bit to go on for both sides:
+  //
+  //   - The input will always have kInputMarginBytes = 15 extra
+  //     available bytes, as long as we're in the main loop, and
+  //     if not, allow_fast_path = false.
+  //   - The output will always have 32 spare bytes (see
+  //     MaxCompressedLength).
+  assert(len > 0);      // Zero-length literals are disallowed
+  int n = len - 1;
+  if (allow_fast_path && len <= 16) {
     // Fits in tag byte
     *op++ = LITERAL | (n << 2);
 
-    // The vast majority of copies are below 16 bytes, for which a
-    // call to memcpy is overkill. This fast path can sometimes
-    // copy up to 15 bytes too much, but that is okay in the
-    // main loop, since we have a bit to go on for both sides:
-    //
-    //   - The input will always have kInputMarginBytes = 15 extra
-    //     available bytes, as long as we're in the main loop, and
-    //     if not, allow_fast_path = false.
-    //   - The output will always have 32 spare bytes (see
-    //     MaxCompressedLength).
-    if (allow_fast_path && len <= 16) {
-      UnalignedCopy64(literal, op);
-      UnalignedCopy64(literal + 8, op + 8);
-      return op + len;
-    }
+    UnalignedCopy64(literal, op);
+    UnalignedCopy64(literal + 8, op + 8);
+    return op + len;
+  }
+
+  if (n < 60) {
+    // Fits in tag byte
+    *op++ = LITERAL | (n << 2);
   } else {
     // Encode in upcoming bytes
     char* base = op;