mirror of https://github.com/google/snappy.git
Speed up the EmitLiteral fast path, +1.62% for ZFlat benchmarks.
This is inspired by the Go version in
//third_party/golang/snappy/encode_amd64.s (emitLiteralFastPath)

Benchmark                        Base:Reference        (1)
--------------------------------------------------------------
(BM_ZFlat_0  1/cputime_ns)         9.669e-06        +1.65%
(BM_ZFlat_1  1/cputime_ns)         7.643e-07        +2.53%
(BM_ZFlat_10 1/cputime_ns)         1.107e-05        -0.97%
(BM_ZFlat_11 1/cputime_ns)         3.002e-06        +0.71%
(BM_ZFlat_12 1/cputime_ns)         2.338e-05        +7.22%
(BM_ZFlat_13 1/cputime_ns)         6.386e-05        +9.18%
(BM_ZFlat_14 1/cputime_ns)         0.0002256        -0.05%
(BM_ZFlat_15 1/cputime_ns)         7.608e-07        -1.29%
(BM_ZFlat_16 1/cputime_ns)         0.003236         -1.28%
(BM_ZFlat_17 1/cputime_ns)         2.58e-06         +0.52%
(BM_ZFlat_18 1/cputime_ns)         0.01538          +0.00%
(BM_ZFlat_19 1/cputime_ns)         1.436e-05        +6.21%
(BM_ZFlat_2  1/cputime_ns)         0.0001044        +4.99%
(BM_ZFlat_20 1/cputime_ns)         0.0001608        -0.18%
(BM_ZFlat_3  1/cputime_ns)         0.003745         +0.38%
(BM_ZFlat_4  1/cputime_ns)         8.144e-05        +6.21%
(BM_ZFlat_5  1/cputime_ns)         2.328e-06        -1.60%
(BM_ZFlat_6  1/cputime_ns)         2.391e-06        +0.06%
(BM_ZFlat_7  1/cputime_ns)         2.68e-06         -0.61%
(BM_ZFlat_8  1/cputime_ns)         8.852e-07        +0.19%
(BM_ZFlat_9  1/cputime_ns)         6.441e-07        +1.06%
geometric mean                                      +1.62%
parent fce661fa8c
commit 094c67de88
snappy.cc | 38
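The change hoists the small-literal case out of the generic n < 60 tag logic, so a literal of at most 16 bytes is emitted with one tag byte and two unconditional 8-byte copies. The snippet below is a minimal sketch of that idea only, not the committed code (the real diff follows); copy8 is a hypothetical stand-in for snappy's UnalignedCopy64, and the caller is assumed to guarantee the 15-byte input margin and 32-byte output margin described in the comments.

#include <cassert>
#include <cstring>

// Hypothetical stand-in for snappy's UnalignedCopy64: copy 8 bytes
// without caring about pointer alignment.
static inline void copy8(const char* src, char* dst) {
  std::memcpy(dst, src, 8);
}

// In snappy's tag scheme a literal tag byte has 0 in the low two bits,
// with (length - 1) in the upper six bits whenever length - 1 < 60.
static const unsigned char kLiteralTag = 0;

// Sketch of the hoisted fast path: for 1..16 byte literals, emit the tag
// byte and copy 16 bytes unconditionally.  Up to 15 of the copied bytes
// can lie past the end of the literal, which is safe only because of the
// input/output margins assumed above.
static inline char* EmitShortLiteralSketch(char* op, const char* literal,
                                           int len) {
  assert(len > 0 && len <= 16);
  *op++ = kLiteralTag | static_cast<char>((len - 1) << 2);
  copy8(literal, op);          // bytes 0..7
  copy8(literal + 8, op + 8);  // bytes 8..15 (may over-copy)
  return op + len;             // advance by the real literal length only
}

For example, a 5-byte literal produces the tag byte 0x10 (that is, (5 - 1) << 2), 16 bytes are copied, and the returned pointer is advanced by 6: one tag byte plus the 5 literal bytes that actually count.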
@@ -157,26 +157,30 @@ static inline char* EmitLiteral(char* op,
                                 const char* literal,
                                 int len,
                                 bool allow_fast_path) {
-  int n = len - 1;      // Zero-length literals are disallowed
-  if (n < 60) {
+  // The vast majority of copies are below 16 bytes, for which a
+  // call to memcpy is overkill. This fast path can sometimes
+  // copy up to 15 bytes too much, but that is okay in the
+  // main loop, since we have a bit to go on for both sides:
+  //
+  // - The input will always have kInputMarginBytes = 15 extra
+  //   available bytes, as long as we're in the main loop, and
+  //   if not, allow_fast_path = false.
+  // - The output will always have 32 spare bytes (see
+  //   MaxCompressedLength).
+  assert(len > 0);      // Zero-length literals are disallowed
+  int n = len - 1;
+  if (allow_fast_path && len <= 16) {
     // Fits in tag byte
     *op++ = LITERAL | (n << 2);
 
-    // The vast majority of copies are below 16 bytes, for which a
-    // call to memcpy is overkill. This fast path can sometimes
-    // copy up to 15 bytes too much, but that is okay in the
-    // main loop, since we have a bit to go on for both sides:
-    //
-    // - The input will always have kInputMarginBytes = 15 extra
-    //   available bytes, as long as we're in the main loop, and
-    //   if not, allow_fast_path = false.
-    // - The output will always have 32 spare bytes (see
-    //   MaxCompressedLength).
-    if (allow_fast_path && len <= 16) {
-      UnalignedCopy64(literal, op);
-      UnalignedCopy64(literal + 8, op + 8);
-      return op + len;
-    }
+    UnalignedCopy64(literal, op);
+    UnalignedCopy64(literal + 8, op + 8);
+    return op + len;
+  }
+
+  if (n < 60) {
+    // Fits in tag byte
+    *op++ = LITERAL | (n << 2);
   } else {
     // Encode in upcoming bytes
     char* base = op;
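UnalignedCopy64 is the snappy helper the fast path relies on: an 8-byte copy that does not require aligned pointers. Its exact definition lives elsewhere in snappy and is not part of this diff; a portable sketch with the same effect, assuming nothing beyond standard memcpy, looks like this:

#include <cstring>

// Portable sketch of an 8-byte copy that tolerates any alignment.
// The real UnalignedCopy64 in snappy may compile down to a single
// 64-bit load and store on platforms that permit unaligned access.
static inline void UnalignedCopy64Sketch(const void* src, void* dst) {
  std::memcpy(dst, src, 8);
}

Two such copies write output bytes 0..15, so a single len <= 16 test covers every short literal without a length-dependent memcpy.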