From 094c67de88f41eae494a3823f8aaf0f77b25b980 Mon Sep 17 00:00:00 2001 From: ckennelly Date: Mon, 27 Jun 2016 05:01:31 -0700 Subject: [PATCH] Speed up the EmitLiteral fast path, +1.62% for ZFlat benchmarks. This is inspired by the Go version in //third_party/golang/snappy/encode_amd64.s (emitLiteralFastPath) Benchmark Base:Reference (1) -------------------------------------------------- (BM_ZFlat_0 1/cputime_ns) 9.669e-06 +1.65% (BM_ZFlat_1 1/cputime_ns) 7.643e-07 +2.53% (BM_ZFlat_10 1/cputime_ns) 1.107e-05 -0.97% (BM_ZFlat_11 1/cputime_ns) 3.002e-06 +0.71% (BM_ZFlat_12 1/cputime_ns) 2.338e-05 +7.22% (BM_ZFlat_13 1/cputime_ns) 6.386e-05 +9.18% (BM_ZFlat_14 1/cputime_ns) 0.0002256 -0.05% (BM_ZFlat_15 1/cputime_ns) 7.608e-07 -1.29% (BM_ZFlat_16 1/cputime_ns) 0.003236 -1.28% (BM_ZFlat_17 1/cputime_ns) 2.58e-06 +0.52% (BM_ZFlat_18 1/cputime_ns) 0.01538 +0.00% (BM_ZFlat_19 1/cputime_ns) 1.436e-05 +6.21% (BM_ZFlat_2 1/cputime_ns) 0.0001044 +4.99% (BM_ZFlat_20 1/cputime_ns) 0.0001608 -0.18% (BM_ZFlat_3 1/cputime_ns) 0.003745 +0.38% (BM_ZFlat_4 1/cputime_ns) 8.144e-05 +6.21% (BM_ZFlat_5 1/cputime_ns) 2.328e-06 -1.60% (BM_ZFlat_6 1/cputime_ns) 2.391e-06 +0.06% (BM_ZFlat_7 1/cputime_ns) 2.68e-06 -0.61% (BM_ZFlat_8 1/cputime_ns) 8.852e-07 +0.19% (BM_ZFlat_9 1/cputime_ns) 6.441e-07 +1.06% geometric mean +1.62% --- snappy.cc | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/snappy.cc b/snappy.cc index 3b9988a..089219c 100644 --- a/snappy.cc +++ b/snappy.cc @@ -157,26 +157,30 @@ static inline char* EmitLiteral(char* op, const char* literal, int len, bool allow_fast_path) { - int n = len - 1; // Zero-length literals are disallowed - if (n < 60) { + // The vast majority of copies are below 16 bytes, for which a + // call to memcpy is overkill. This fast path can sometimes + // copy up to 15 bytes too much, but that is okay in the + // main loop, since we have a bit to go on for both sides: + // + // - The input will always have kInputMarginBytes = 15 extra + // available bytes, as long as we're in the main loop, and + // if not, allow_fast_path = false. + // - The output will always have 32 spare bytes (see + // MaxCompressedLength). + assert(len > 0); // Zero-length literals are disallowed + int n = len - 1; + if (allow_fast_path && len <= 16) { // Fits in tag byte *op++ = LITERAL | (n << 2); - // The vast majority of copies are below 16 bytes, for which a - // call to memcpy is overkill. This fast path can sometimes - // copy up to 15 bytes too much, but that is okay in the - // main loop, since we have a bit to go on for both sides: - // - // - The input will always have kInputMarginBytes = 15 extra - // available bytes, as long as we're in the main loop, and - // if not, allow_fast_path = false. - // - The output will always have 32 spare bytes (see - // MaxCompressedLength). - if (allow_fast_path && len <= 16) { - UnalignedCopy64(literal, op); - UnalignedCopy64(literal + 8, op + 8); - return op + len; - } + UnalignedCopy64(literal, op); + UnalignedCopy64(literal + 8, op + 8); + return op + len; + } + + if (n < 60) { + // Fits in tag byte + *op++ = LITERAL | (n << 2); } else { // Encode in upcoming bytes char* base = op;