Modify MemCopy64 to use AVX 32-byte copies instead of SSE2 16-byte copies on capable x86 platforms. This gives an average speedup of 6.87% on Milan and 1.90% on Skylake.

PiperOrigin-RevId: 480370725
Snappy Team 2022-10-11 16:00:34 +00:00 committed by Victor Costan
parent 984b191f0f
commit a2d219a8a8
1 changed file with 16 additions and 7 deletions
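
For context, here is a minimal standalone sketch of the strategy this commit adopts, assuming (as the function's contract guarantees) that the destination always has 64 writable bytes. This is not snappy's actual code: the name CopyUpTo64 is hypothetical, and SNAPPY_PREDICT_FALSE is replaced by a plain branch. The common short copy needs a single 32-byte AVX load/store; the second half is touched only when size exceeds 32.

#include <cassert>
#include <cstddef>
#include <cstring>
#if defined(__x86_64__) && defined(__AVX__)
#include <immintrin.h>  // AVX intrinsics; build with -mavx or -march=native
#endif

// Copies `size` (<= 64) bytes from src to dst. dst must have 64 writable
// bytes; bytes past `size` may be clobbered, mirroring MemCopy64's contract.
void CopyUpTo64(char* dst, const char* src, std::size_t size) {
  assert(size <= 64);
#if defined(__x86_64__) && defined(__AVX__)
  // One unaligned 32-byte load/store covers the common short copy.
  __m256i lo = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(src));
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), lo);
  if (size > 32) {
    // Rarely taken: a second 32-byte load/store completes the full 64 bytes.
    __m256i hi = _mm256_lddqu_si256(reinterpret_cast<const __m256i*>(src) + 1);
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst) + 1, hi);
  }
#else
  // Portable fallback: fixed-size copies the compiler can turn into SIMD.
  std::memmove(dst, src, 32);
  if (size > 32) std::memmove(dst + 32, src + 32, 32);
#endif
}

Without the intrinsics, the fixed 32-byte memmove typically lowers to a pair of 16-byte SSE2 moves on baseline x86-64, which is what the speedup figures above compare against.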


@@ -989,27 +989,36 @@ inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
 // so gives better performance. [src, src + size) must not overlap with
 // [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64).
 void MemCopy64(char* dst, const void* src, size_t size) {
-  // Always copy this many bytes, test if we need to copy more.
+  // Always copy this many bytes. If that's below size then copy the full 64.
   constexpr int kShortMemCopy = 32;
-  // We're always allowed to copy 64 bytes, so if we exceed kShortMemCopy just
-  // copy 64 rather than the exact amount.
-  constexpr int kLongMemCopy = 64;
 
-  assert(size <= kLongMemCopy);
+  assert(size <= 64);
   assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,
                                         dst) ||
          std::less_equal<const void*>()(dst + size, src));
 
   // We know that src and dst are at least size bytes apart. However, because we
   // might copy more than size bytes the copy still might overlap past size.
-  // E.g. if src and dst appear consecutively in memory (src + size == dst).
+  // E.g. if src and dst appear consecutively in memory (src + size >= dst).
+  // TODO: Investigate wider copies on other platforms.
+#if defined(__x86_64__) && defined(__AVX__)
+  assert(kShortMemCopy <= 32);
+  __m256i data = _mm256_lddqu_si256(static_cast<const __m256i *>(src));
+  _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), data);
+  // Profiling shows that nearly all copies are short.
+  if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
+    data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
+  }
+#else
   std::memmove(dst, src, kShortMemCopy);
   // Profiling shows that nearly all copies are short.
   if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
     std::memmove(dst + kShortMemCopy,
                  static_cast<const uint8_t*>(src) + kShortMemCopy,
-                 kLongMemCopy - kShortMemCopy);
+                 64 - kShortMemCopy);
   }
+#endif
 }
 
 void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {
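
As the comment above MemCopy64 notes, the function may read and write up to 64 bytes regardless of size, so callers need 64 bytes of slack at both src and dst. A hypothetical caller-side illustration of that contract (buf and the offsets are invented for the example):

// Working buffer with enough slack on both sides of the copy.
char buf[160] = "abcdefgh";
// Legal: [src, src + 3) and [dst, dst + 3) do not overlap, and both
// [src, src + 64) and [dst, dst + 64) stay inside buf.
MemCopy64(buf + 80, buf, 3);
// Only buf[80..82] are meaningful afterwards; bytes up to buf[143] may
// have been overwritten with data past src + 3, which callers must tolerate.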