mirror of https://github.com/google/snappy.git
Use 64-bit optimized code path for ARM64.
This is inspired by https://github.com/google/snappy/pull/22.

Benchmark results with the change, Pixel C with Android N2G48B:

Benchmark Time(ns) CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0 119544 119253 1501 818.9MB/s html
BM_UFlat/1 1223950 1208588 163 554.0MB/s urls
BM_UFlat/2 16081 15962 11527 7.2GB/s jpg
BM_UFlat/3 356 352 416666 540.6MB/s jpg_200
BM_UFlat/4 25010 24860 7683 3.8GB/s pdf
BM_UFlat/5 484832 481572 407 811.1MB/s html4
BM_UFlat/6 408410 408713 482 354.9MB/s txt1
BM_UFlat/7 361714 361663 553 330.1MB/s txt2
BM_UFlat/8 1090582 1087912 182 374.1MB/s txt3
BM_UFlat/9 1503127 1503759 133 305.6MB/s txt4
BM_UFlat/10 114183 114285 1715 989.6MB/s pb
BM_UFlat/11 406714 407331 491 431.5MB/s gaviota
BM_UIOVec/0 370397 369888 538 264.0MB/s html
BM_UIOVec/1 3207510 3190000 100 209.9MB/s urls
BM_UIOVec/2 16589 16573 11223 6.9GB/s jpg
BM_UIOVec/3 1052 1052 165289 181.2MB/s jpg_200
BM_UIOVec/4 49151 49184 3985 1.9GB/s pdf
BM_UValidate/0 68115 68095 2893 1.4GB/s html
BM_UValidate/1 792652 792000 250 845.4MB/s urls
BM_UValidate/2 334 334 487804 343.1GB/s jpg
BM_UValidate/3 235 235 666666 809.9MB/s jpg_200
BM_UValidate/4 6126 6130 32626 15.6GB/s pdf
BM_ZFlat/0 292697 290560 678 336.1MB/s html (22.31 %)
BM_ZFlat/1 4062080 4050000 100 165.3MB/s urls (47.78 %)
BM_ZFlat/2 29225 29274 6422 3.9GB/s jpg (99.95 %)
BM_ZFlat/3 1099 1098 163934 173.7MB/s jpg_200 (73.00 %)
BM_ZFlat/4 44117 44233 4205 2.2GB/s pdf (83.30 %)
BM_ZFlat/5 1158058 1157894 171 337.4MB/s html4 (22.52 %)
BM_ZFlat/6 1102983 1093922 181 132.6MB/s txt1 (57.88 %)
BM_ZFlat/7 974142 975490 204 122.4MB/s txt2 (61.91 %)
BM_ZFlat/8 2984670 2990000 100 136.1MB/s txt3 (54.99 %)
BM_ZFlat/9 4100130 4090000 100 112.4MB/s txt4 (66.26 %)
BM_ZFlat/10 276236 275139 716 411.0MB/s pb (19.68 %)
BM_ZFlat/11 760091 759541 262 231.4MB/s gaviota (37.72 %)

Baseline benchmark results, Pixel C with Android N2G48B:

Benchmark Time(ns) CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0 148957 147565 1335 661.8MB/s html
BM_UFlat/1 1527257 1500000 132 446.4MB/s urls
BM_UFlat/2 19589 19397 8764 5.9GB/s jpg
BM_UFlat/3 425 418 408163 455.3MB/s jpg_200
BM_UFlat/4 30096 29552 6497 3.2GB/s pdf
BM_UFlat/5 595933 594594 333 657.0MB/s html4
BM_UFlat/6 516315 514360 383 282.0MB/s txt1
BM_UFlat/7 454653 453514 441 263.2MB/s txt2
BM_UFlat/8 1382687 1361111 144 299.0MB/s txt3
BM_UFlat/9 1967590 1904761 105 241.3MB/s txt4
BM_UFlat/10 148271 144560 1342 782.3MB/s pb
BM_UFlat/11 523997 510471 382 344.4MB/s gaviota
BM_UIOVec/0 478443 465227 417 209.9MB/s html
BM_UIOVec/1 4172860 4060000 100 164.9MB/s urls
BM_UIOVec/2 21470 20975 7342 5.5GB/s jpg
BM_UIOVec/3 1357 1330 75187 143.4MB/s jpg_200
BM_UIOVec/4 63143 61365 3031 1.6GB/s pdf
BM_UValidate/0 86910 85125 2279 1.1GB/s html
BM_UValidate/1 1022256 1000000 195 669.6MB/s urls
BM_UValidate/2 420 417 400000 274.6GB/s jpg
BM_UValidate/3 311 302 571428 630.0MB/s jpg_200
BM_UValidate/4 7778 7584 25445 12.6GB/s pdf
BM_ZFlat/0 469209 457547 424 213.4MB/s html (22.31 %)
BM_ZFlat/1 5633510 5460000 100 122.6MB/s urls (47.78 %)
BM_ZFlat/2 37896 36693 4524 3.1GB/s jpg (99.95 %)
BM_ZFlat/3 1485 1441 123456 132.3MB/s jpg_200 (73.00 %)
BM_ZFlat/4 74870 72775 2652 1.3GB/s pdf (83.30 %)
BM_ZFlat/5 1857321 1785714 112 218.8MB/s html4 (22.52 %)
BM_ZFlat/6 1538723 1492307 130 97.2MB/s txt1 (57.88 %)
BM_ZFlat/7 1338236 1310810 148 91.1MB/s txt2 (61.91 %)
BM_ZFlat/8 4050820 4040000 100 100.7MB/s txt3 (54.99 %)
BM_ZFlat/9 5234940 5230000 100 87.9MB/s txt4 (66.26 %)
BM_ZFlat/10 400309 400000 495 282.7MB/s pb (19.68 %)
BM_ZFlat/11 1063042 1058510 188 166.1MB/s gaviota (37.72 %)
This commit is contained in:
parent
77c12adc19
commit
632cd0f128
|
@ -8,10 +8,7 @@ option(BUILD_SHARED_LIBS "Build shared libraries(DLLs)." OFF)
|
||||||
option(SNAPPY_BUILD_TESTS "Build Snappy's own tests." ON)
|
option(SNAPPY_BUILD_TESTS "Build Snappy's own tests." ON)
|
||||||
|
|
||||||
include(TestBigEndian)
|
include(TestBigEndian)
|
||||||
test_big_endian(WORDS_BIG_ENDIAN)
|
test_big_endian(SNAPPY_IS_BIG_ENDIAN)
|
||||||
if(WORDS_BIG_ENDIAN)
|
|
||||||
add_definitions(-DWORDS_BIGENDIAN=1)
|
|
||||||
endif(WORDS_BIG_ENDIAN)
|
|
||||||
|
|
||||||
include(CheckIncludeFile)
|
include(CheckIncludeFile)
|
||||||
check_include_file("byteswap.h" HAVE_BYTESWAP_H)
|
check_include_file("byteswap.h" HAVE_BYTESWAP_H)
|
||||||
|
|
|
@ -55,8 +55,8 @@
|
||||||
/* Define to 1 if you have the <windows.h> header file. */
|
/* Define to 1 if you have the <windows.h> header file. */
|
||||||
#cmakedefine HAVE_WINDOWS_H 1
|
#cmakedefine HAVE_WINDOWS_H 1
|
||||||
|
|
||||||
/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
|
/* Define to 1 if your processor stores words with the most significant byte
|
||||||
significant byte first (like Motorola and SPARC, unlike Intel and VAX). */
|
first (like Motorola and SPARC, unlike Intel and VAX). */
|
||||||
#cmakedefine WORDS_BIGENDIAN 1
|
#cmakedefine SNAPPY_IS_BIG_ENDIAN 1
|
||||||
|
|
||||||
#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
|
#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
|
||||||
|
|
|
@ -83,7 +83,8 @@ char* CompressFragment(const char* input,
|
||||||
// Requires that s2_limit >= s2.
|
// Requires that s2_limit >= s2.
|
||||||
//
|
//
|
||||||
// Separate implementation for 64-bit, little-endian cpus.
|
// Separate implementation for 64-bit, little-endian cpus.
|
||||||
#if defined(ARCH_K8) || (defined(ARCH_PPC) && !defined(WORDS_BIGENDIAN))
|
#if !defined(SNAPPY_IS_BIG_ENDIAN) && \
|
||||||
|
(defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
|
||||||
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
|
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
|
||||||
const char* s2,
|
const char* s2,
|
||||||
const char* s2_limit) {
|
const char* s2_limit) {
|
||||||
|
|
|
@ -64,6 +64,10 @@
|
||||||
|
|
||||||
#define ARCH_PPC 1
|
#define ARCH_PPC 1
|
||||||
|
|
||||||
|
#elif defined(__aarch64__)
|
||||||
|
|
||||||
|
#define ARCH_ARM 1
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Needed by OS X, among others.
|
// Needed by OS X, among others.
|
||||||
|
@ -104,9 +108,10 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
|
||||||
|
|
||||||
// Potentially unaligned loads and stores.
|
// Potentially unaligned loads and stores.
|
||||||
|
|
||||||
// x86 and PowerPC can simply do these loads and stores native.
|
// x86, PowerPC, and ARM64 can simply do these loads and stores native.
|
||||||
|
|
||||||
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
|
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || \
|
||||||
|
defined(__aarch64__)
|
||||||
|
|
||||||
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
||||||
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
||||||
|
@ -234,7 +239,7 @@ inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// The following guarantees declaration of the byte swap functions.
|
// The following guarantees declaration of the byte swap functions.
|
||||||
#ifdef WORDS_BIGENDIAN
|
#if defined(SNAPPY_IS_BIG_ENDIAN)
|
||||||
|
|
||||||
#ifdef HAVE_SYS_BYTEORDER_H
|
#ifdef HAVE_SYS_BYTEORDER_H
|
||||||
#include <sys/byteorder.h>
|
#include <sys/byteorder.h>
|
||||||
|
@ -291,7 +296,7 @@ inline uint64 bswap_64(uint64 x) {
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // WORDS_BIGENDIAN
|
#endif // defined(SNAPPY_IS_BIG_ENDIAN)
|
||||||
|
|
||||||
// Convert to little-endian storage, opposite of network format.
|
// Convert to little-endian storage, opposite of network format.
|
||||||
// Convert x from host to little endian: x = LittleEndian.FromHost(x);
|
// Convert x from host to little endian: x = LittleEndian.FromHost(x);
|
||||||
|
@ -305,7 +310,7 @@ inline uint64 bswap_64(uint64 x) {
|
||||||
class LittleEndian {
|
class LittleEndian {
|
||||||
public:
|
public:
|
||||||
// Conversion functions.
|
// Conversion functions.
|
||||||
#ifdef WORDS_BIGENDIAN
|
#if defined(SNAPPY_IS_BIG_ENDIAN)
|
||||||
|
|
||||||
static uint16 FromHost16(uint16 x) { return bswap_16(x); }
|
static uint16 FromHost16(uint16 x) { return bswap_16(x); }
|
||||||
static uint16 ToHost16(uint16 x) { return bswap_16(x); }
|
static uint16 ToHost16(uint16 x) { return bswap_16(x); }
|
||||||
|
@ -315,7 +320,7 @@ class LittleEndian {
|
||||||
|
|
||||||
static bool IsLittleEndian() { return false; }
|
static bool IsLittleEndian() { return false; }
|
||||||
|
|
||||||
#else // !defined(WORDS_BIGENDIAN)
|
#else // !defined(SNAPPY_IS_BIG_ENDIAN)
|
||||||
|
|
||||||
static uint16 FromHost16(uint16 x) { return x; }
|
static uint16 FromHost16(uint16 x) { return x; }
|
||||||
static uint16 ToHost16(uint16 x) { return x; }
|
static uint16 ToHost16(uint16 x) { return x; }
|
||||||
|
@ -325,7 +330,7 @@ class LittleEndian {
|
||||||
|
|
||||||
static bool IsLittleEndian() { return true; }
|
static bool IsLittleEndian() { return true; }
|
||||||
|
|
||||||
#endif // !defined(WORDS_BIGENDIAN)
|
#endif // !defined(SNAPPY_IS_BIG_ENDIAN)
|
||||||
|
|
||||||
// Functions to do unaligned loads and stores in little-endian order.
|
// Functions to do unaligned loads and stores in little-endian order.
|
||||||
static uint16 Load16(const void *p) {
|
static uint16 Load16(const void *p) {
|
||||||
|
@ -356,9 +361,9 @@ class Bits {
|
||||||
// that it's 0-indexed.
|
// that it's 0-indexed.
|
||||||
static int FindLSBSetNonZero(uint32 n);
|
static int FindLSBSetNonZero(uint32 n);
|
||||||
|
|
||||||
#if defined(ARCH_K8) || defined(ARCH_PPC)
|
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||||
static int FindLSBSetNonZero64(uint64 n);
|
static int FindLSBSetNonZero64(uint64 n);
|
||||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC)
|
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// No copying
|
// No copying
|
||||||
|
@ -376,11 +381,11 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
|
||||||
return __builtin_ctz(n);
|
return __builtin_ctz(n);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(ARCH_K8) || defined(ARCH_PPC)
|
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||||
inline int Bits::FindLSBSetNonZero64(uint64 n) {
|
inline int Bits::FindLSBSetNonZero64(uint64 n) {
|
||||||
return __builtin_ctzll(n);
|
return __builtin_ctzll(n);
|
||||||
}
|
}
|
||||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC)
|
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||||
|
|
||||||
#elif defined(_MSC_VER)
|
#elif defined(_MSC_VER)
|
||||||
|
|
||||||
|
@ -399,13 +404,13 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
|
||||||
return 32;
|
return 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(ARCH_K8) || defined(ARCH_PPC)
|
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||||
inline int Bits::FindLSBSetNonZero64(uint64 n) {
|
inline int Bits::FindLSBSetNonZero64(uint64 n) {
|
||||||
unsigned long where;
|
unsigned long where;
|
||||||
if (_BitScanForward64(&where, n)) return static_cast<int>(where);
|
if (_BitScanForward64(&where, n)) return static_cast<int>(where);
|
||||||
return 64;
|
return 64;
|
||||||
}
|
}
|
||||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC)
|
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||||
|
|
||||||
#else // Portable versions.
|
#else // Portable versions.
|
||||||
|
|
||||||
|
@ -439,7 +444,7 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(ARCH_K8) || defined(ARCH_PPC)
|
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||||
// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero().
|
// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero().
|
||||||
inline int Bits::FindLSBSetNonZero64(uint64 n) {
|
inline int Bits::FindLSBSetNonZero64(uint64 n) {
|
||||||
const uint32 bottombits = static_cast<uint32>(n);
|
const uint32 bottombits = static_cast<uint32>(n);
|
||||||
|
@ -450,7 +455,7 @@ inline int Bits::FindLSBSetNonZero64(uint64 n) {
|
||||||
return FindLSBSetNonZero(bottombits);
|
return FindLSBSetNonZero(bottombits);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC)
|
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||||
|
|
||||||
#endif // End portable versions.
|
#endif // End portable versions.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue