Re-work fast path that emits copies in zippy compression.

The primary motivation for the change is that FindMatchLength is
likely to discover a difference in the first 8 bytes it compares.
If that occurs then we know the length of the match is less than 12,
because FindMatchLength is invoked after a 4-byte match is found.
When emitting a copy, it is useful to know that the length is less
than 12 because the two-byte variant of an emitted copy requires that.

This is a performance-tuning change that should not affect the
library's behavior.

With FDO on perflab/Haswell the geometric mean for ZFlat/* went from
47,290ns to 45,741ns, an improvement of 3.4%.

SAMPLE (before)

BM_ZFlat/0      102824     102650      40691 951.4MB/s  html (22.31 %)
BM_ZFlat/1     1293512    1290442       3225 518.9MB/s  urls (47.78 %)
BM_ZFlat/2       10373      10353     417959 11.1GB/s  jpg (99.95 %)
BM_ZFlat/3         268        268   15745324 712.4MB/s  jpg_200 (73.00 %)
BM_ZFlat/4       12137      12113     342462 7.9GB/s  pdf (83.30 %)
BM_ZFlat/5      430672     429720       9724 909.0MB/s  html4 (22.52 %)
BM_ZFlat/6      420541     419636       9833 345.6MB/s  txt1 (57.88 %)
BM_ZFlat/7      373829     373158      10000 319.9MB/s  txt2 (61.91 %)
BM_ZFlat/8     1119014    1116604       3755 364.5MB/s  txt3 (54.99 %)
BM_ZFlat/9     1544203    1540657       2748 298.3MB/s  txt4 (66.26 %)
BM_ZFlat/10      91041      90866      46002 1.2GB/s  pb (19.68 %)
BM_ZFlat/11     332766     331990      10000 529.5MB/s  gaviota (37.72 %)
BM_ZFlat/12      39960      39886     100000 588.3MB/s  cp (48.12 %)
BM_ZFlat/13      14493      14465     287181 735.1MB/s  c (42.47 %)
BM_ZFlat/14       4447       4440     947927 799.3MB/s  lsp (48.37 %)
BM_ZFlat/15    1316362    1313350       3196 747.7MB/s  xls (41.23 %)
BM_ZFlat/16        312        311   10000000 613.0MB/s  xls_200 (78.00 %)
BM_ZFlat/17     388471     387502      10000 1.2GB/s  bin (18.11 %)
BM_ZFlat/18         65         64   64838208 2.9GB/s  bin_200 (7.50 %)
BM_ZFlat/19      65900      65787      63099 554.3MB/s  sum (48.96 %)
BM_ZFlat/20       6188       6177     681951 652.6MB/s  man (59.21 %)

SAMPLE (after)

Benchmark     Time(ns)    CPU(ns) Iterations
--------------------------------------------
BM_ZFlat/0       99259      99044      42428 986.0MB/s  html (22.31 %)
BM_ZFlat/1     1257039    1255276       3341 533.4MB/s  urls (47.78 %)
BM_ZFlat/2       10044      10030     405781 11.4GB/s  jpg (99.95 %)
BM_ZFlat/3         268        267   15732282 713.3MB/s  jpg_200 (73.00 %)
BM_ZFlat/4       11675      11657     358629 8.2GB/s  pdf (83.30 %)
BM_ZFlat/5      420951     419818       9739 930.5MB/s  html4 (22.52 %)
BM_ZFlat/6      415460     414632      10000 349.8MB/s  txt1 (57.88 %)
BM_ZFlat/7      367191     366436      10000 325.8MB/s  txt2 (61.91 %)
BM_ZFlat/8     1098345    1096036       3819 371.3MB/s  txt3 (54.99 %)
BM_ZFlat/9     1508701    1505306       2758 305.3MB/s  txt4 (66.26 %)
BM_ZFlat/10      87195      87031      47289 1.3GB/s  pb (19.68 %)
BM_ZFlat/11     322338     321637      10000 546.5MB/s  gaviota (37.72 %)
BM_ZFlat/12      36739      36668     100000 639.9MB/s  cp (48.12 %)
BM_ZFlat/13      13646      13618     304009 780.9MB/s  c (42.47 %)
BM_ZFlat/14       4249       4240     992456 837.0MB/s  lsp (48.37 %)
BM_ZFlat/15    1262925    1260012       3314 779.4MB/s  xls (41.23 %)
BM_ZFlat/16        308        308   10000000 619.8MB/s  xls_200 (78.00 %)
BM_ZFlat/17     379750     378944      10000 1.3GB/s  bin (18.11 %)
BM_ZFlat/18         62         62   67443280 3.0GB/s  bin_200 (7.50 %)
BM_ZFlat/19      61706      61587      67645 592.1MB/s  sum (48.96 %)
BM_ZFlat/20       5968       5958     698974 676.6MB/s  man (59.21 %)
This commit is contained in:
Geoff Pike 2016-06-28 11:53:11 -07:00 committed by Alkis Evlogimenos
parent 094c67de88
commit 38a5ec5fca
3 changed files with 76 additions and 49 deletions

View File

@ -70,11 +70,12 @@ char* CompressFragment(const char* input,
uint16* table,
const int table_size);
// Return the largest n such that
// Find the largest n such that
//
// s1[0,n-1] == s2[0,n-1]
// and n <= (s2_limit - s2).
//
// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
@ -82,11 +83,27 @@ char* CompressFragment(const char* input,
// Separate implementation for x86_64, for speed. Uses the fact that
// x86_64 is little endian.
#if defined(ARCH_K8)
static inline int FindMatchLength(const char* s1,
static inline pair<size_t, bool> FindMatchLength(const char* s1,
const char* s2,
const char* s2_limit) {
assert(s2_limit >= s2);
int matched = 0;
size_t matched = 0;
// This block isn't necessary for correctness; we could just start looping
// immediately. As an optimization though, it is useful. It creates some not
// uncommon code paths that determine, without extra effort, whether the match
// length is less than 8. In short, we are hoping to avoid a conditional
// branch, and perhaps get better code layout from the C++ compiler.
if (PREDICT_TRUE(s2 <= s2_limit - 8)) {
uint64 a1 = UNALIGNED_LOAD64(s1);
uint64 a2 = UNALIGNED_LOAD64(s2);
if (a1 != a2) {
return pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3, true);
} else {
matched = 8;
s2 += 8;
}
}
// Find out how long the match is. We loop over the data 64 bits at a
// time until we find a 64-bit block that doesn't match; then we find
@ -97,14 +114,11 @@ static inline int FindMatchLength(const char* s1,
s2 += 8;
matched += 8;
} else {
// On current (mid-2008) Opteron models there is a 3% more
// efficient code sequence to find the first non-matching byte.
// However, what follows is ~10% better on Intel Core 2 and newer,
// and we expect AMD's bsf instruction to improve.
uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched);
int matching_bits = Bits::FindLSBSetNonZero64(x);
matched += matching_bits >> 3;
return matched;
assert(matched >= 8);
return pair<size_t, bool>(matched, false);
}
}
while (PREDICT_TRUE(s2 < s2_limit)) {
@ -112,13 +126,13 @@ static inline int FindMatchLength(const char* s1,
++s2;
++matched;
} else {
return matched;
return pair<size_t, bool>(matched, matched < 8);
}
}
return matched;
return pair<size_t, bool>(matched, matched < 8);
}
#else
static inline int FindMatchLength(const char* s1,
static inline pair<size_t, bool> FindMatchLength(const char* s1,
const char* s2,
const char* s2_limit) {
// Implementation based on the x86-64 version, above.
@ -140,7 +154,7 @@ static inline int FindMatchLength(const char* s1,
++matched;
}
}
return matched;
return pair<size_t, bool>(matched, matched < 8);
}
#endif

View File

@ -41,7 +41,6 @@ namespace snappy {
using internal::COPY_1_BYTE_OFFSET;
using internal::COPY_2_BYTE_OFFSET;
using internal::COPY_4_BYTE_OFFSET;
using internal::LITERAL;
using internal::char_table;
using internal::kMaximumTagLength;
@ -199,42 +198,54 @@ static inline char* EmitLiteral(char* op,
return op + len;
}
static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
bool len_less_than_12) {
assert(len <= 64);
assert(len >= 4);
assert(offset < 65536);
assert(len_less_than_12 == (len < 12));
if ((len < 12) && (offset < 2048)) {
size_t len_minus_4 = len - 4;
assert(len_minus_4 < 8); // Must fit in 3 bits
*op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
if (len_less_than_12 && PREDICT_TRUE(offset < 2048)) {
// offset fits in 11 bits. The 3 highest go in the top of the first byte,
// and the rest go in the second byte.
*op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
*op++ = offset & 0xff;
} else {
*op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
LittleEndian::Store16(op, offset);
op += 2;
// Write 4 bytes, though we only care about 3 of them. The output buffer
// is required to have some slack, so the extra byte won't overrun it.
uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
LittleEndian::Store32(op, u);
op += 3;
}
return op;
}
static inline char* EmitCopy(char* op, size_t offset, int len) {
// Emit 64 byte copies but make sure to keep at least four bytes reserved
static inline char* EmitCopy(char* op, size_t offset, size_t len,
bool len_less_than_12) {
assert(len_less_than_12 == (len < 12));
if (len_less_than_12) {
return EmitCopyAtMost64(op, offset, len, true);
} else {
// A special case for len <= 64 might help, but so far measurements suggest
// it's in the noise.
// Emit 64 byte copies but make sure to keep at least four bytes reserved.
while (PREDICT_FALSE(len >= 68)) {
op = EmitCopyLessThan64(op, offset, 64);
op = EmitCopyAtMost64(op, offset, 64, false);
len -= 64;
}
// Emit an extra 60 byte copy if have too much data to fit in one copy
// One or two copies will now finish the job.
if (len > 64) {
op = EmitCopyLessThan64(op, offset, 60);
op = EmitCopyAtMost64(op, offset, 60, false);
len -= 60;
}
// Emit remainder
op = EmitCopyLessThan64(op, offset, len);
// Emit remainder.
op = EmitCopyAtMost64(op, offset, len, len < 12);
return op;
}
}
bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
uint32 v = 0;
@ -422,19 +433,20 @@ char* CompressFragment(const char* input,
// We have a 4-byte match at ip, and no need to emit any
// "literal bytes" prior to ip.
const char* base = ip;
int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
pair<size_t, bool> p = FindMatchLength(candidate + 4, ip + 4, ip_end);
size_t matched = 4 + p.first;
ip += matched;
size_t offset = base - candidate;
assert(0 == memcmp(base, candidate, matched));
op = EmitCopy(op, offset, matched);
// We could immediately start working at ip now, but to improve
// compression we first update table[Hash(ip - 1, ...)].
const char* insert_tail = ip - 1;
op = EmitCopy(op, offset, matched, p.second);
next_emit = ip;
if (PREDICT_FALSE(ip >= ip_limit)) {
goto emit_remainder;
}
input_bytes = GetEightBytesAt(insert_tail);
// We are now looking for a 4-byte match again. We read
// table[Hash(ip, shift)] for that. To improve compression,
// we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
input_bytes = GetEightBytesAt(ip - 1);
uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
table[prev_hash] = ip - base_ip - 1;
uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);

View File

@ -1054,7 +1054,9 @@ TEST(Snappy, ZeroOffsetCopyValidation) {
namespace {
int TestFindMatchLength(const char* s1, const char *s2, unsigned length) {
return snappy::internal::FindMatchLength(s1, s2, s2 + length);
pair<size_t, bool> p = snappy::internal::FindMatchLength(s1, s2, s2 + length);
CHECK_EQ(p.first < 8, p.second);
return p.first;
}
} // namespace
@ -1164,8 +1166,7 @@ TEST(Snappy, FindMatchLengthRandom) {
}
DataEndingAtUnreadablePage u(s);
DataEndingAtUnreadablePage v(t);
int matched = snappy::internal::FindMatchLength(
u.data(), v.data(), v.data() + t.size());
int matched = TestFindMatchLength(u.data(), v.data(), t.size());
if (matched == t.size()) {
EXPECT_EQ(s, t);
} else {