Speed up the EmitLiteral fast path, +1.62% for ZFlat benchmarks.

This is inspired by the Go version in
//third_party/golang/snappy/encode_amd64.s (emitLiteralFastPath).

Benchmark                       Base:Reference   (1)
--------------------------------------------------
(BM_ZFlat_0 1/cputime_ns)       9.669e-06        +1.65%
(BM_ZFlat_1 1/cputime_ns)       7.643e-07        +2.53%
(BM_ZFlat_10 1/cputime_ns)      1.107e-05        -0.97%
(BM_ZFlat_11 1/cputime_ns)      3.002e-06        +0.71%
(BM_ZFlat_12 1/cputime_ns)      2.338e-05        +7.22%
(BM_ZFlat_13 1/cputime_ns)      6.386e-05        +9.18%
(BM_ZFlat_14 1/cputime_ns)      0.0002256        -0.05%
(BM_ZFlat_15 1/cputime_ns)      7.608e-07        -1.29%
(BM_ZFlat_16 1/cputime_ns)      0.003236         -1.28%
(BM_ZFlat_17 1/cputime_ns)      2.58e-06         +0.52%
(BM_ZFlat_18 1/cputime_ns)      0.01538          +0.00%
(BM_ZFlat_19 1/cputime_ns)      1.436e-05        +6.21%
(BM_ZFlat_2 1/cputime_ns)       0.0001044        +4.99%
(BM_ZFlat_20 1/cputime_ns)      0.0001608        -0.18%
(BM_ZFlat_3 1/cputime_ns)       0.003745         +0.38%
(BM_ZFlat_4 1/cputime_ns)       8.144e-05        +6.21%
(BM_ZFlat_5 1/cputime_ns)       2.328e-06        -1.60%
(BM_ZFlat_6 1/cputime_ns)       2.391e-06        +0.06%
(BM_ZFlat_7 1/cputime_ns)       2.68e-06         -0.61%
(BM_ZFlat_8 1/cputime_ns)       8.852e-07        +0.19%
(BM_ZFlat_9 1/cputime_ns)       6.441e-07        +1.06%
geometric mean                                   +1.62%
2016-06-27 05:01:31 -07:00 · 2016-06-27 05:01:31 -07:00 · 094c67de88
parent fce661fa8c
commit 094c67de88
1 changed file with 21 additions and 17 deletions
--- a/snappy.cc
+++ b/snappy.cc
@@ -157,26 +157,30 @@ static inline char* EmitLiteral(char* op,
                                const char* literal,
                                int len,
                                bool allow_fast_path) {
-  int n = len - 1;      // Zero-length literals are disallowed
-  if (n < 60) {
+  // The vast majority of copies are below 16 bytes, for which a
+  // call to memcpy is overkill. This fast path can sometimes
+  // copy up to 15 bytes too much, but that is okay in the
+  // main loop, since we have a bit to go on for both sides:
+  //
+  //   - The input will always have kInputMarginBytes = 15 extra
+  //     available bytes, as long as we're in the main loop, and
+  //     if not, allow_fast_path = false.
+  //   - The output will always have 32 spare bytes (see
+  //     MaxCompressedLength).
+  assert(len > 0);      // Zero-length literals are disallowed
+  int n = len - 1;
+  if (allow_fast_path && len <= 16) {
    // Fits in tag byte
    *op++ = LITERAL | (n << 2);

-    // The vast majority of copies are below 16 bytes, for which a
-    // call to memcpy is overkill. This fast path can sometimes
-    // copy up to 15 bytes too much, but that is okay in the
-    // main loop, since we have a bit to go on for both sides:
-    //
-    //   - The input will always have kInputMarginBytes = 15 extra
-    //     available bytes, as long as we're in the main loop, and
-    //     if not, allow_fast_path = false.
-    //   - The output will always have 32 spare bytes (see
-    //     MaxCompressedLength).
-    if (allow_fast_path && len <= 16) {
-      UnalignedCopy64(literal, op);
-      UnalignedCopy64(literal + 8, op + 8);
-      return op + len;
-    }
+    UnalignedCopy64(literal, op);
+    UnalignedCopy64(literal + 8, op + 8);
+    return op + len;
+  }
+
+  if (n < 60) {
+    // Fits in tag byte
+    *op++ = LITERAL | (n << 2);
  } else {
    // Encode in upcoming bytes
    char* base = op;