Improve the speed of hashing in zippy compression.

This change replaces the hashing function used during compression with
one that is roughly as good but faster.  This speeds up compression by
two to a few percent on the Intel-, AMD-, and Arm-based machines we
tested.  The amount of compression is roughly unchanged.

PiperOrigin-RevId: 485960303
Author: Snappy Team, 2022-11-03 20:36:33 +00:00 (committed by Victor Costan)
Commit: 8881ba172a (parent: a2d219a8a8)
3 changed files with 79 additions and 20 deletions
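
In short, the commit swaps Snappy's multiplicative hash of four input bytes for a single hardware CRC32C instruction where one is available, keeping the multiply-and-shift as the fallback. A minimal sketch of the two strategies, not taken from the patch itself: MultiplicativeHash, CrcHash, and kDemoHashBits are hypothetical names, with 14 standing in for Snappy's kMaxHashTableBits; the constant 0x1e35a7bd and the intrinsic _mm_crc32_u32 are the ones used in the real code below.

#include <cstdint>
#if defined(__SSE4_2__)
#include <immintrin.h>  // _mm_crc32_u32: x86 CRC32C instruction
#endif

// Stand-in for Snappy's kMaxHashTableBits (assumed 14 for illustration).
constexpr int kDemoHashBits = 14;

// Old scheme: multiply by a magic constant and keep the upper bits, which are
// the better-mixed ones.
inline uint32_t MultiplicativeHash(uint32_t bytes) {
  constexpr uint32_t kMagic = 0x1e35a7bd;
  return (kMagic * bytes) >> (32 - kDemoHashBits);
}

// New scheme (when the CPU has it): one CRC32C instruction. All output bits of
// a CRC are equally good hash bits, so the caller can simply mask off however
// many it needs.
inline uint32_t CrcHash(uint32_t bytes) {
#if defined(__SSE4_2__)
  return _mm_crc32_u32(0, bytes);
#else
  return MultiplicativeHash(bytes);  // targets without a CRC32C instruction
#endif
}

On CPUs with SSE4.2 or the Arm CRC extension this replaces a multiply plus shift with one low-latency instruction per hash-table probe, which is the source of the few-percent speedup reported above.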

CMakeLists.txt

@@ -172,6 +172,19 @@ int main() {
   return 0;
 }" SNAPPY_HAVE_SSSE3)
 
+check_cxx_source_compiles("
+#include <immintrin.h>
+int main() {
+  return _mm_crc32_u32(0, 1);
+}" SNAPPY_HAVE_X86_CRC32)
+
+check_cxx_source_compiles("
+#include <arm_neon.h>
+#include <arm_acle.h>
+int main() {
+  return __crc32cw(0, 1);
+}" SNAPPY_HAVE_NEON_CRC32)
+
 check_cxx_source_compiles("
 #include <immintrin.h>
 int main() {
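
Each new check_cxx_source_compiles call above tries to build a tiny probe program and sets the corresponding SNAPPY_HAVE_*_CRC32 variable only if it compiles. A roughly equivalent standalone probe (hypothetical file crc_probe.cc, not part of the build) is below; the x86 branch needs something like -msse4.2 and the Arm branch something like -march=armv8-a+crc for the intrinsics to be available.

// crc_probe.cc (hypothetical): mirrors the configure-time checks above.
#if defined(__SSE4_2__)
#include <immintrin.h>
#elif defined(__ARM_FEATURE_CRC32)
#include <arm_neon.h>
#include <arm_acle.h>
#endif

int main() {
#if defined(__SSE4_2__)
  return _mm_crc32_u32(0, 1);  // what the SNAPPY_HAVE_X86_CRC32 check compiles
#elif defined(__ARM_FEATURE_CRC32)
  return __crc32cw(0, 1);      // what the SNAPPY_HAVE_NEON_CRC32 check compiles
#else
  return 0;                    // neither intrinsic is available on this target
#endif
}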

cmake/config.h.in

@@ -46,12 +46,19 @@
 /* Define to 1 if you target processors with SSSE3+ and have <tmmintrin.h>. */
 #cmakedefine01 SNAPPY_HAVE_SSSE3
 
+/* Define to 1 if you target processors with SSE4.2 and have <crc32intrin.h>. */
+#cmakedefine01 SNAPPY_HAVE_X86_CRC32
+
 /* Define to 1 if you target processors with BMI2+ and have <bmi2intrin.h>. */
 #cmakedefine01 SNAPPY_HAVE_BMI2
 
 /* Define to 1 if you target processors with NEON and have <arm_neon.h>. */
 #cmakedefine01 SNAPPY_HAVE_NEON
 
+/* Define to 1 if you have <arm_neon.h> and <arm_acle.h> and want to optimize
+   compression speed by using __crc32cw from <arm_acle.h>. */
+#cmakedefine01 SNAPPY_HAVE_NEON_CRC32
+
 /* Define to 1 if your processor stores words with the most significant byte
    first (like Motorola and SPARC, unlike Intel and VAX). */
 #cmakedefine01 SNAPPY_IS_BIG_ENDIAN

snappy.cc

@@ -45,10 +45,28 @@
 #endif
 #endif  // !defined(SNAPPY_HAVE_BMI2)
 
-#if SNAPPY_HAVE_BMI2
+#if !defined(SNAPPY_HAVE_X86_CRC32)
+#if defined(__SSE4_2__)
+#define SNAPPY_HAVE_X86_CRC32 1
+#else
+#define SNAPPY_HAVE_X86_CRC32 0
+#endif
+#endif  // !defined(SNAPPY_HAVE_X86_CRC32)
+
+#if !defined(SNAPPY_HAVE_NEON_CRC32)
+#if SNAPPY_HAVE_NEON && defined(__ARM_FEATURE_CRC32)
+#define SNAPPY_HAVE_NEON_CRC32 1
+#else
+#define SNAPPY_HAVE_NEON_CRC32 0
+#endif
+#endif  // !defined(SNAPPY_HAVE_NEON_CRC32)
+
+#if SNAPPY_HAVE_BMI2 || SNAPPY_HAVE_X86_CRC32
 // Please do not replace with <x86intrin.h>. or with headers that assume more
 // advanced SSE versions without checking with all the OWNERS.
 #include <immintrin.h>
+#elif SNAPPY_HAVE_NEON_CRC32
+#include <arm_acle.h>
 #endif
 
 #include <algorithm>
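
Taken together, the fallback guards above pick exactly one hashing path per build: the CRC32C intrinsic from <immintrin.h> on x86 with SSE4.2, __crc32cw from <arm_acle.h> on Arm with the CRC extension, and the multiplicative hash otherwise. A hedged sketch of that decision as a standalone program; the DEMO_* macros are stand-ins so this does not collide with Snappy's real config, and the Arm case is simplified (the real guard also requires SNAPPY_HAVE_NEON).

#include <cstdio>

#if defined(__SSE4_2__)
#define DEMO_HAVE_X86_CRC32 1
#else
#define DEMO_HAVE_X86_CRC32 0
#endif

#if defined(__ARM_FEATURE_CRC32)
#define DEMO_HAVE_NEON_CRC32 1
#else
#define DEMO_HAVE_NEON_CRC32 0
#endif

int main() {
  // e.g. g++ -msse4.2 selects the x86 CRC path, -march=armv8-a+crc the Arm
  // one; with neither flag the multiplicative hash stays in use.
#if DEMO_HAVE_X86_CRC32
  std::puts("hashing via _mm_crc32_u32 (SSE4.2 CRC32C)");
#elif DEMO_HAVE_NEON_CRC32
  std::puts("hashing via __crc32cw (Arm CRC32 extension)");
#else
  std::puts("hashing via the multiplicative hash (no CRC instruction)");
#endif
  return 0;
}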
@@ -127,14 +145,34 @@ constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) {
 alignas(64) const std::array<int16_t, 256> kLengthMinusOffset =
     MakeTable(make_index_sequence<256>{});
 
-// Any hash function will produce a valid compressed bitstream, but a good
-// hash function reduces the number of collisions and thus yields better
-// compression for compressible input, and more speed for incompressible
-// input. Of course, it doesn't hurt if the hash function is reasonably fast
-// either, as it gets called a lot.
-inline uint32_t HashBytes(uint32_t bytes, uint32_t mask) {
+// Given a table of uint16_t whose size is mask / 2 + 1, return a pointer to the
+// relevant entry, if any, for the given bytes. Any hash function will do,
+// but a good hash function reduces the number of collisions and thus yields
+// better compression for compressible input.
+//
+// REQUIRES: mask is 2 * (table_size - 1), and table_size is a power of two.
+inline uint16_t* TableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) {
+  // Our choice is quicker-and-dirtier than the typical hash function;
+  // empirically, that seems beneficial. The upper bits of kMagic * bytes are a
+  // higher-quality hash than the lower bits, so when using kMagic * bytes we
+  // also shift right to get a higher-quality end result. There's no similar
+  // issue with a CRC because all of the output bits of a CRC are equally good
+  // "hashes." So, a CPU instruction for CRC, if available, tends to be a good
+  // choice.
+#if SNAPPY_HAVE_NEON_CRC32
+  // We use mask as the second arg to the CRC function, as it's about to
+  // be used anyway; it'd be equally correct to use 0 or some constant.
+  // Mathematically, _mm_crc32_u32 (or similar) is a function of the
+  // xor of its arguments.
+  const uint32_t hash = __crc32cw(bytes, mask);
+#elif SNAPPY_HAVE_X86_CRC32
+  const uint32_t hash = _mm_crc32_u32(bytes, mask);
+#else
   constexpr uint32_t kMagic = 0x1e35a7bd;
-  return ((kMagic * bytes) >> (32 - kMaxHashTableBits)) & mask;
+  const uint32_t hash = (kMagic * bytes) >> (31 - kMaxHashTableBits);
+#endif
+  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
+                                     (hash & mask));
 }
 
 }  // namespace
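
The REQUIRES clause and the pointer arithmetic above encode a small indexing trick: mask is 2 * (table_size - 1), i.e. the byte offset of the last uint16_t entry, and its low bit is clear, so hash & mask is always an even byte offset into the table and can be added to the table's address directly, with no extra shift. A standalone worked sketch, with a hypothetical DemoTableEntry and a table_size of 8 chosen purely for illustration:

#include <cassert>
#include <cstdint>

// Hypothetical stand-in using the same indexing scheme as TableEntry above.
inline uint16_t* DemoTableEntry(uint16_t* table, uint32_t hash, uint32_t mask) {
  // mask has its low bit clear, so (hash & mask) is an even byte offset and
  // the resulting pointer stays aligned for uint16_t.
  return reinterpret_cast<uint16_t*>(reinterpret_cast<uintptr_t>(table) +
                                     (hash & mask));
}

int main() {
  constexpr uint32_t table_size = 8;               // must be a power of two
  constexpr uint32_t mask = 2 * (table_size - 1);  // 14 == 0b1110
  uint16_t table[table_size] = {};

  for (uint32_t hash = 0; hash < 64; ++hash) {
    uint16_t* entry = DemoTableEntry(table, hash, mask);
    // Same slot as indexing with bits 1..3 of the hash: table[(hash & mask) / 2].
    assert(entry == &table[(hash & mask) / 2]);
    assert(entry >= table && entry < table + table_size);
  }
  return 0;
}

This is why CompressFragment below now computes mask = 2 * (table_size - 1) instead of table_size - 1, and why the multiplicative fallback shifts by (31 - kMaxHashTableBits) rather than (32 - kMaxHashTableBits): keeping one extra low bit lines the useful hash bits up with the even byte offsets that the mask selects.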
@@ -727,7 +765,7 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
   const char* ip = input;
   assert(input_size <= kBlockSize);
   assert((table_size & (table_size - 1)) == 0);  // table must be power of two
-  const uint32_t mask = table_size - 1;
+  const uint32_t mask = 2 * (table_size - 1);
   const char* ip_end = input + input_size;
   const char* base_ip = ip;
@@ -778,11 +816,11 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
         // loaded in preload.
         uint32_t dword = i == 0 ? preload : static_cast<uint32_t>(data);
         assert(dword == LittleEndian::Load32(ip + i));
-        uint32_t hash = HashBytes(dword, mask);
-        candidate = base_ip + table[hash];
+        uint16_t* table_entry = TableEntry(table, dword, mask);
+        candidate = base_ip + *table_entry;
         assert(candidate >= base_ip);
         assert(candidate < ip + i);
-        table[hash] = delta + i;
+        *table_entry = delta + i;
         if (SNAPPY_PREDICT_FALSE(LittleEndian::Load32(candidate) == dword)) {
           *op = LITERAL | (i << 2);
           UnalignedCopy128(next_emit, op + 1);
@@ -799,7 +837,7 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
       }
       while (true) {
         assert(static_cast<uint32_t>(data) == LittleEndian::Load32(ip));
-        uint32_t hash = HashBytes(data, mask);
+        uint16_t* table_entry = TableEntry(table, data, mask);
         uint32_t bytes_between_hash_lookups = skip >> 5;
         skip += bytes_between_hash_lookups;
         const char* next_ip = ip + bytes_between_hash_lookups;
@@ -807,11 +845,11 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
           ip = next_emit;
           goto emit_remainder;
         }
-        candidate = base_ip + table[hash];
+        candidate = base_ip + *table_entry;
         assert(candidate >= base_ip);
         assert(candidate < ip);
-        table[hash] = ip - base_ip;
+        *table_entry = ip - base_ip;
         if (SNAPPY_PREDICT_FALSE(static_cast<uint32_t>(data) ==
                                  LittleEndian::Load32(candidate))) {
           break;
@@ -857,12 +895,13 @@ char* CompressFragment(const char* input, size_t input_size, char* op,
         assert((data & 0xFFFFFFFFFF) ==
                (LittleEndian::Load64(ip) & 0xFFFFFFFFFF));
         // We are now looking for a 4-byte match again. We read
-        // table[Hash(ip, shift)] for that. To improve compression,
+        // table[Hash(ip, mask)] for that. To improve compression,
         // we also update table[Hash(ip - 1, mask)] and table[Hash(ip, mask)].
-        table[HashBytes(LittleEndian::Load32(ip - 1), mask)] = ip - base_ip - 1;
-        uint32_t hash = HashBytes(data, mask);
-        candidate = base_ip + table[hash];
-        table[hash] = ip - base_ip;
+        *TableEntry(table, LittleEndian::Load32(ip - 1), mask) =
+            ip - base_ip - 1;
+        uint16_t* table_entry = TableEntry(table, data, mask);
+        candidate = base_ip + *table_entry;
+        *table_entry = ip - base_ip;
         // Measurements on the benchmarks have shown the following probabilities
         // for the loop to exit (ie. avg. number of iterations is reciprocal).
         // BM_Flat/6 txt1 p = 0.3-0.4