mirror of https://github.com/google/snappy.git
Improve zippy by 5-10%.

name                      old speed      new speed      delta
BM_ZCord/0  [html   ]   1.26GB/s ± 0%  1.35GB/s ± 0%  +7.90%  (p=0.008 n=5+5)
BM_ZCord/1  [urls   ]    535MB/s ± 0%   562MB/s ± 0%  +5.05%  (p=0.008 n=5+5)
BM_ZCord/2  [jpg    ]   10.2GB/s ± 1%  10.2GB/s ± 0%     ~    (p=0.310 n=5+5)
BM_ZCord/3  [jpg_200]    841MB/s ± 1%   846MB/s ± 1%     ~    (p=0.421 n=5+5)
BM_ZCord/4  [pdf    ]   6.77GB/s ± 1%  7.06GB/s ± 1%  +4.28%  (p=0.008 n=5+5)
BM_ZCord/5  [html4  ]   1.00GB/s ± 0%  1.08GB/s ± 0%  +7.94%  (p=0.008 n=5+5)
BM_ZCord/6  [txt1   ]    391MB/s ± 0%   417MB/s ± 0%  +6.71%  (p=0.008 n=5+5)
BM_ZCord/7  [txt2   ]    363MB/s ± 0%   388MB/s ± 0%  +6.73%  (p=0.016 n=5+4)
BM_ZCord/8  [txt3   ]    400MB/s ± 0%   426MB/s ± 0%  +6.55%  (p=0.008 n=5+5)
BM_ZCord/9  [txt4   ]    328MB/s ± 0%   350MB/s ± 0%  +6.66%  (p=0.008 n=5+5)
BM_ZCord/10 [pb     ]   1.67GB/s ± 1%  1.80GB/s ± 0%  +7.52%  (p=0.008 n=5+5)

1) A key bottleneck in the data dependency chain is figuring out how many
bytes are matched and loading the data for the next hash value. The
load-to-use latency is 5 cycles; in the previous change (cl/303353110) we
removed that load and used "shrd" instead to realign the previously loaded
values. Unfortunately "shrd" itself has a 4-cycle latency for variable
shifts; we'd prefer "shrx", which takes 1 cycle.

2) Maximally reuse data that has already been computed. The trick above
yields 5 bytes of useful data, so when we need to search for a new match we
can use them for the first lookup (which starts one byte further on).

PiperOrigin-RevId: 303875535
parent 4dfcad9f4e
commit d674348a0c
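To illustrate point 1 above, here is a rough, compiler-friendly C++ sketch of computing *data off the s1 dependency chain. It is not the committed code (which uses inline asm and Snappy's own UNALIGNED_LOAD64 / Bits::FindLSBSetNonZero64 helpers); Load64 and MatchedBytesAndData are made-up names, __builtin_ctzll stands in for the tzcnt, and at least 12 readable bytes at s2 are assumed:

  // Sketch only; hypothetical helper names, GCC/Clang builtins assumed.
  #include <cstddef>
  #include <cstdint>
  #include <cstring>
  #include <utility>

  static inline uint64_t Load64(const char* p) {  // unaligned load via memcpy
    uint64_t v;
    std::memcpy(&v, p, sizeof(v));
    return v;
  }

  // Given the first 8 bytes at the candidate (a1) and at ip (a2), with a1 != a2,
  // return how many leading bytes match plus 8 bytes whose low 5 bytes equal the
  // bytes starting at s2 + matched_bytes.
  static inline std::pair<size_t, uint64_t> MatchedBytesAndData(uint64_t a1,
                                                                uint64_t a2,
                                                                const char* s2) {
    uint64_t xorval = a1 ^ a2;
    int shift = __builtin_ctzll(xorval);  // tzcnt; xorval is nonzero here
    size_t matched_bytes = shift >> 3;    // 0..7
    // Straightforward form puts a load on the critical chain:
    //   data = Load64(s2 + matched_bytes)  (load-to-use ~5 cycles after the tzcnt)
    // Off-chain form: both loads depend only on s2, and the select condition
    // matched_bytes < 4 is the same as (uint32_t)xorval != 0, so the compiler
    // can emit a cmov that runs in parallel with the tzcnt.
    uint64_t lo = Load64(s2);
    uint64_t hi = Load64(s2 + 4);
    uint64_t base = static_cast<uint32_t>(xorval) != 0 ? lo : hi;
    uint64_t data = base >> ((matched_bytes & 3) * 8);
    return {matched_bytes, data};
  }

On non-x86 targets the patch keeps the plain UNALIGNED_LOAD64(s2 + matched_bytes) form, as the diff below shows.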
@@ -89,7 +89,7 @@ char* CompressFragment(const char* input,
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition populate *data with the next 8 bytes from the end of the match.
// In addition populate *data with the next 5 bytes from the end of the match.
// This is only done if 8 bytes are available (s2_limit - s2 >= 8). The point is
// that on some arch's this can be done faster in this routine than subsequent
// loading from s2 + n.
@@ -113,23 +113,65 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
uint64 a1 = UNALIGNED_LOAD64(s1);
uint64 a2 = UNALIGNED_LOAD64(s2);
if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
// This code is critical for performance. The reason is that it determines
// how much to advance `ip` (s2). This obviously depends on both the loads
// from the `candidate` (s1) and `ip`. Furthermore the next `candidate`
// depends on the advanced `ip` calculated here through a load, hash and
// new candidate hash lookup (a lot of cycles). This makes s1 (ie.
// `candidate`) the variable that limits throughput. This is the reason we
// go through hoops to have this function update `data` for the next iter.
// The straightforward code would use *data, given by
//
// *data = UNALIGNED_LOAD64(s2 + matched_bytes) (Latency of 5 cycles),
//
// as input for the hash table lookup to find next candidate. However
// this forces the load on the data dependency chain of s1, because
// matched_bytes directly depends on s1. However matched_bytes is 0..7, so
// we can also calculate *data by
//
// *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
//                    matched_bytes);
//
// The loads do not depend on s1 anymore and are thus off the bottleneck.
// The straightforward implementation on x86_64 would be to use
//
// shrd rax, rdx, cl (cl being matched_bytes * 8)
//
// unfortunately shrd with a variable shift has a 4 cycle latency. So this
// only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle variable
// shift instruction but can only shift 64 bits. If we focus on just
// obtaining the least significant 4 bytes, we can obtain this by
//
// *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
//                         UNALIGNED_LOAD64(s2 + 4) >> ((matched_bytes & 3) * 8);
//
// Writen like above this is not a big win, the conditional move would be
// a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
// However matched_bytes < 4 is equal to static_cast<uint32>(xorval) != 0.
// Writen that way the conditional move (2 cycles) can execute parallel
// with FindLSBSetNonZero64 (tzcnt), which takes 3 cycles.
uint64 xorval = a1 ^ a2;
int shift = Bits::FindLSBSetNonZero64(xorval);
size_t matched_bytes = shift >> 3;
#ifndef __x86_64__
*data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
// Unfortunately the compiler cannot find this using the obvious c++ code
// *data = shift == 0 ? a2 : (a2 >> shift) | (a3 << (64 - shift);
// the reason is that the above needs the conditional clause to guard
// against UB when shift == 0. The compiler doesn't realize the full
// expression can be lowered into a single "shrd" instruction and in
// effect the conditional can be ignored.
uint64 a3 = UNALIGNED_LOAD64(s2 + 8);
asm ("shrdq %%cl, %1, %0\n\t" : "+r"(a2) : "r"(a3), "c"(shift & -8));
*data = a2;
// Ideally this would just be
//
// a2 = static_cast<uint32>(xorval) == 0 ? a3 : a2;
//
// However clang correctly infers that the above statement participates on
// a critical data dependency chain and thus, unfortunately, refuses to
// use a conditional move (it's tuned to cut data dependencies). In this
// case there is a longer parallel chain anyway AND this will be fairly
// unpredictable.
uint64 a3 = UNALIGNED_LOAD64(s2 + 4);
asm("testl %k2, %k2\n\t"
    "cmovzq %1, %0\n\t"
    : "+r"(a2)
    : "r"(a3), "r"(xorval));
*data = a2 >> (shift & (3 * 8));
#endif
assert(*data == UNALIGNED_LOAD64(s2 + matched_bytes));
return std::pair<size_t, bool>(matched_bytes, true);
} else {
matched = 8;
@@ -154,11 +196,13 @@ static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
#ifndef __x86_64__
*data = UNALIGNED_LOAD64(s2 + matched_bytes);
#else
uint64 a3 = UNALIGNED_LOAD64(s2 + 8);
asm("shrdq %%cl, %1, %0\n\t" : "+r"(a2) : "r"(a3), "c"(shift & -8));
*data = a2;
uint64 a3 = UNALIGNED_LOAD64(s2 + 4);
asm("testl %k2, %k2\n\t"
    "cmovzq %1, %0\n\t"
    : "+r"(a2)
    : "r"(a3), "r"(xorval));
*data = a2 >> (shift & (3 * 8));
#endif
assert(*data == UNALIGNED_LOAD64(s2 + matched_bytes));
matched += matched_bytes;
assert(matched >= 8);
return std::pair<size_t, bool>(matched, false);
snappy.cc (47 changed lines)
@@ -516,16 +516,16 @@ char* CompressFragment(const char* input,
assert(static_cast<int>(kuint32max >> shift) == table_size - 1);
const char* ip_end = input + input_size;
const char* base_ip = ip;
// Bytes in [next_emit, ip) will be emitted as literal bytes. Or
// [next_emit, ip_end) after the main loop.
const char* next_emit = ip;

const size_t kInputMarginBytes = 15;
if (SNAPPY_PREDICT_TRUE(input_size >= kInputMarginBytes)) {
const char* ip_limit = input + input_size - kInputMarginBytes;

for (uint64 data = LittleEndian::Load64(++ip);;) {
assert(next_emit < ip);
for (uint32 preload = LittleEndian::Load32(ip + 1);;) {
// Bytes in [next_emit, ip) will be emitted as literal bytes. Or
// [next_emit, ip_end) after the main loop.
const char* next_emit = ip++;
uint64 data = LittleEndian::Load64(ip);
// The body of this loop calls EmitLiteral once and then EmitCopy one or
// more times. (The exception is that when we're close to exhausting
// the input we goto emit_remainder.)
@@ -559,14 +559,17 @@ char* CompressFragment(const char* input,
for (int j = 0; j < 4; j++) {
for (int k = 0; k < 4; k++) {
int i = 4 * j + k;
assert(static_cast<uint32>(data) == LittleEndian::Load32(ip + i));
uint32 hash = HashBytes(data, shift);
// These for-loops are meant to be unrolled. So we can freely
// special case the first iteration to use the value already
// loaded in preload.
uint32 dword = i == 0 ? preload : data;
assert(dword == LittleEndian::Load32(ip + i));
uint32 hash = HashBytes(dword, shift);
candidate = base_ip + table[hash];
assert(candidate >= base_ip);
assert(candidate < ip + i);
table[hash] = delta + i;
if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) ==
                         static_cast<uint32>(data))) {
if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) == dword)) {
*op = LITERAL | (i << 2);
UnalignedCopy128(next_emit, op + 1);
ip += i;
@@ -587,6 +590,7 @@ char* CompressFragment(const char* input,
skip += bytes_between_hash_lookups;
const char* next_ip = ip + bytes_between_hash_lookups;
if (SNAPPY_PREDICT_FALSE(next_ip > ip_limit)) {
ip = next_emit;
goto emit_remainder;
}
candidate = base_ip + table[hash];
@@ -632,11 +636,12 @@ char* CompressFragment(const char* input,
} else {
op = EmitCopy</*len_less_than_12=*/false>(op, offset, matched);
}
next_emit = ip;
if (SNAPPY_PREDICT_FALSE(ip >= ip_limit)) {
goto emit_remainder;
}
assert(LittleEndian::Load64(ip) == data);
// Expect 5 bytes to match
assert((data & 0xFFFFFFFFFF) ==
       (LittleEndian::Load64(ip) & 0xFFFFFFFFFF));
// We are now looking for a 4-byte match again. We read
// table[Hash(ip, shift)] for that. To improve compression,
// we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
@@ -645,17 +650,27 @@ char* CompressFragment(const char* input,
uint32 hash = HashBytes(data, shift);
candidate = base_ip + table[hash];
table[hash] = ip - base_ip;
// Measurements on the benchmarks have shown the following probabilities
// for the loop to exit (ie. avg. number of iterations is reciprocal).
// BM_Flat/6 txt1 p = 0.3-0.4
// BM_Flat/7 txt2 p = 0.35
// BM_Flat/8 txt3 p = 0.3-0.4
// BM_Flat/9 txt3 p = 0.34-0.4
// BM_Flat/10 pb p = 0.4
// BM_Flat/11 gaviota p = 0.1
// BM_Flat/12 cp p = 0.5
// BM_Flat/13 c p = 0.3
} while (static_cast<uint32>(data) == LittleEndian::Load32(candidate));
++ip;
data = LittleEndian::Load64(ip);
// Because the least significant 5 bytes matched, we can utilize data
// for the next iteration.
preload = data >> 8;
}
}

emit_remainder:
// Emit the remaining bytes as a literal
if (next_emit < ip_end) {
op = EmitLiteral</*allow_fast_path=*/false>(op, next_emit,
                                            ip_end - next_emit);
if (ip < ip_end) {
op = EmitLiteral</*allow_fast_path=*/false>(op, ip, ip_end - ip);
}

return op;
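A toy example of point 2, again only a sketch and not Snappy's code: because the 8 bytes at ip were already loaded into data, the 4 bytes needed to hash position ip + 1 are already available as data >> 8, so the first probe of the next search does not wait on a fresh load. HashBytes32 is a stand-in for Snappy's HashBytes, the shift value is arbitrary, and a little-endian host is assumed:

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  static uint64_t Load64(const char* p) {  // plain memcpy load; little-endian host assumed
    uint64_t v;
    std::memcpy(&v, p, sizeof(v));
    return v;
  }

  // Stand-in for Snappy's HashBytes; multiplier and shift are illustrative.
  static uint32_t HashBytes32(uint32_t bytes, int shift) {
    return (bytes * 0x1e35a7bd) >> shift;
  }

  int main() {
    const char buf[] = "example input bytes";
    const char* ip = buf;
    const int shift = 17;  // depends on the hash table size in Snappy

    uint64_t data = Load64(ip);  // one load covers ip and ip + 1
    uint32_t preload = static_cast<uint32_t>(data >> 8);  // the 4 bytes at ip + 1
    uint32_t h_fast = HashBytes32(preload, shift);
    uint32_t h_slow = HashBytes32(static_cast<uint32_t>(Load64(ip + 1)), shift);
    std::printf("%u %u\n", h_fast, h_slow);  // prints the same hash twice
    return 0;
  }

This is the same reuse the unrolled loop above performs with "uint32 dword = i == 0 ? preload : data;" and "preload = data >> 8;".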