Speed up the EmitLiteral fast path, +1.62% for ZFlat benchmarks.

This is inspired by the Go version in
//third_party/golang/snappy/encode_amd64.s (emitLiteralFastPath).
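
For orientation, a minimal self-contained sketch of the shape of this fast
path, assuming a portable 8-byte unaligned copy built on memcpy and snappy's
LITERAL tag value of 0; the names EmitShortLiteral and kLiteralTag are
illustrative only, and the authoritative change is the diff below.

#include <cstdint>
#include <cstring>

// Portable stand-in for snappy's UnalignedCopy64: an 8-byte copy that
// tolerates unaligned source and destination pointers.
static inline void UnalignedCopy64(const void* src, void* dst) {
  std::memcpy(dst, src, 8);
}

// Sketch of the <= 16-byte literal fast path: write the one-byte literal tag,
// copy 16 bytes unconditionally (possibly past the end of the literal), and
// advance the output pointer by only len bytes. Safe only when the caller
// guarantees input/output slack, as the real EmitLiteral does.
static inline char* EmitShortLiteral(char* op, const char* literal, int len) {
  const uint8_t kLiteralTag = 0;           // snappy's LITERAL tag type
  *op++ = kLiteralTag | ((len - 1) << 2);  // len - 1 fits in the tag byte
  UnalignedCopy64(literal, op);
  UnalignedCopy64(literal + 8, op + 8);
  return op + len;
}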

        Benchmark         Base:Reference   (1)
--------------------------------------------------
(BM_ZFlat_0 1/cputime_ns)        9.669e-06  +1.65%
(BM_ZFlat_1 1/cputime_ns)        7.643e-07  +2.53%
(BM_ZFlat_10 1/cputime_ns)       1.107e-05  -0.97%
(BM_ZFlat_11 1/cputime_ns)       3.002e-06  +0.71%
(BM_ZFlat_12 1/cputime_ns)       2.338e-05  +7.22%
(BM_ZFlat_13 1/cputime_ns)       6.386e-05  +9.18%
(BM_ZFlat_14 1/cputime_ns)       0.0002256  -0.05%
(BM_ZFlat_15 1/cputime_ns)       7.608e-07  -1.29%
(BM_ZFlat_16 1/cputime_ns)        0.003236  -1.28%
(BM_ZFlat_17 1/cputime_ns)        2.58e-06  +0.52%
(BM_ZFlat_18 1/cputime_ns)         0.01538  +0.00%
(BM_ZFlat_19 1/cputime_ns)       1.436e-05  +6.21%
(BM_ZFlat_2 1/cputime_ns)        0.0001044  +4.99%
(BM_ZFlat_20 1/cputime_ns)       0.0001608  -0.18%
(BM_ZFlat_3 1/cputime_ns)         0.003745  +0.38%
(BM_ZFlat_4 1/cputime_ns)        8.144e-05  +6.21%
(BM_ZFlat_5 1/cputime_ns)        2.328e-06  -1.60%
(BM_ZFlat_6 1/cputime_ns)        2.391e-06  +0.06%
(BM_ZFlat_7 1/cputime_ns)         2.68e-06  -0.61%
(BM_ZFlat_8 1/cputime_ns)        8.852e-07  +0.19%
(BM_ZFlat_9 1/cputime_ns)        6.441e-07  +1.06%

geometric mean                              +1.62%

commit 094c67de88 (parent fce661fa8c)
Authored by ckennelly, 2016-06-27 05:01:31 -07:00; committed by Alkis Evlogimenos
1 changed file with 21 additions and 17 deletions

@@ -157,26 +157,30 @@ static inline char* EmitLiteral(char* op,
                                 const char* literal,
                                 int len,
                                 bool allow_fast_path) {
-  int n = len - 1;      // Zero-length literals are disallowed
-  if (n < 60) {
+  // The vast majority of copies are below 16 bytes, for which a
+  // call to memcpy is overkill. This fast path can sometimes
+  // copy up to 15 bytes too much, but that is okay in the
+  // main loop, since we have a bit to go on for both sides:
+  //
+  //   - The input will always have kInputMarginBytes = 15 extra
+  //     available bytes, as long as we're in the main loop, and
+  //     if not, allow_fast_path = false.
+  //   - The output will always have 32 spare bytes (see
+  //     MaxCompressedLength).
+  assert(len > 0);      // Zero-length literals are disallowed
+  int n = len - 1;
+  if (allow_fast_path && len <= 16) {
     // Fits in tag byte
     *op++ = LITERAL | (n << 2);
 
-    // The vast majority of copies are below 16 bytes, for which a
-    // call to memcpy is overkill. This fast path can sometimes
-    // copy up to 15 bytes too much, but that is okay in the
-    // main loop, since we have a bit to go on for both sides:
-    //
-    //   - The input will always have kInputMarginBytes = 15 extra
-    //     available bytes, as long as we're in the main loop, and
-    //     if not, allow_fast_path = false.
-    //   - The output will always have 32 spare bytes (see
-    //     MaxCompressedLength).
-    if (allow_fast_path && len <= 16) {
-      UnalignedCopy64(literal, op);
-      UnalignedCopy64(literal + 8, op + 8);
-      return op + len;
-    }
+    UnalignedCopy64(literal, op);
+    UnalignedCopy64(literal + 8, op + 8);
+    return op + len;
+  }
+
+  if (n < 60) {
+    // Fits in tag byte
+    *op++ = LITERAL | (n << 2);
   } else {
     // Encode in upcoming bytes
     char* base = op;
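
As a toy check of the margin reasoning in the comment above (not snappy's API;
buffer sizes and names here are illustrative): for a literal of len <= 16
bytes, the pair of 8-byte copies reads and writes exactly 16 bytes, i.e. up to
16 - len bytes past the literal and past the claimed output, which stays
inside the guaranteed slack of kInputMarginBytes = 15 readable input bytes and
32 spare output bytes (see MaxCompressedLength).

#include <cassert>
#include <cstring>

int main() {
  const int kInputMarginBytes = 15;  // input slack the main loop guarantees
  const int kOutputSpareBytes = 32;  // output slack (see MaxCompressedLength)
  const int len = 5;                 // a short literal, "hello"

  char src[len + kInputMarginBytes] = "hello";  // literal plus readable slack
  char dst[1 + len + kOutputSpareBytes] = {};   // tag byte + literal + spare

  char* op = dst;
  *op++ = 0 /* LITERAL */ | ((len - 1) << 2);   // tag byte; n = len - 1 < 60

  // Two unconditional 8-byte copies, exactly as in the fast path above.
  std::memcpy(op, src, 8);
  std::memcpy(op + 8, src + 8, 8);
  op += len;  // only len bytes of output are claimed

  assert(16 - len <= kInputMarginBytes);   // over-read stays inside the margin
  assert(16 - len <= kOutputSpareBytes);   // over-write stays inside the spare
  assert(std::memcmp(dst + 1, "hello", len) == 0);
  return 0;
}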