From 632cd0f12856ef7e1feda46021940a98d7117bda Mon Sep 17 00:00:00 2001 From: costan Date: Wed, 16 Aug 2017 12:38:06 -0700 Subject: [PATCH] Use 64-bit optimized code path for ARM64. This is inspired by https://github.com/google/snappy/pull/22. Benchmark results with the change, Pixel C with Android N2G48B Benchmark Time(ns) CPU(ns) Iterations --------------------------------------------------- BM_UFlat/0 119544 119253 1501 818.9MB/s html BM_UFlat/1 1223950 1208588 163 554.0MB/s urls BM_UFlat/2 16081 15962 11527 7.2GB/s jpg BM_UFlat/3 356 352 416666 540.6MB/s jpg_200 BM_UFlat/4 25010 24860 7683 3.8GB/s pdf BM_UFlat/5 484832 481572 407 811.1MB/s html4 BM_UFlat/6 408410 408713 482 354.9MB/s txt1 BM_UFlat/7 361714 361663 553 330.1MB/s txt2 BM_UFlat/8 1090582 1087912 182 374.1MB/s txt3 BM_UFlat/9 1503127 1503759 133 305.6MB/s txt4 BM_UFlat/10 114183 114285 1715 989.6MB/s pb BM_UFlat/11 406714 407331 491 431.5MB/s gaviota BM_UIOVec/0 370397 369888 538 264.0MB/s html BM_UIOVec/1 3207510 3190000 100 209.9MB/s urls BM_UIOVec/2 16589 16573 11223 6.9GB/s jpg BM_UIOVec/3 1052 1052 165289 181.2MB/s jpg_200 BM_UIOVec/4 49151 49184 3985 1.9GB/s pdf BM_UValidate/0 68115 68095 2893 1.4GB/s html BM_UValidate/1 792652 792000 250 845.4MB/s urls BM_UValidate/2 334 334 487804 343.1GB/s jpg BM_UValidate/3 235 235 666666 809.9MB/s jpg_200 BM_UValidate/4 6126 6130 32626 15.6GB/s pdf BM_ZFlat/0 292697 290560 678 336.1MB/s html (22.31 %) BM_ZFlat/1 4062080 4050000 100 165.3MB/s urls (47.78 %) BM_ZFlat/2 29225 29274 6422 3.9GB/s jpg (99.95 %) BM_ZFlat/3 1099 1098 163934 173.7MB/s jpg_200 (73.00 %) BM_ZFlat/4 44117 44233 4205 2.2GB/s pdf (83.30 %) BM_ZFlat/5 1158058 1157894 171 337.4MB/s html4 (22.52 %) BM_ZFlat/6 1102983 1093922 181 132.6MB/s txt1 (57.88 %) BM_ZFlat/7 974142 975490 204 122.4MB/s txt2 (61.91 %) BM_ZFlat/8 2984670 2990000 100 136.1MB/s txt3 (54.99 %) BM_ZFlat/9 4100130 4090000 100 112.4MB/s txt4 (66.26 %) BM_ZFlat/10 276236 275139 716 411.0MB/s pb (19.68 %) BM_ZFlat/11 760091 
759541 262 231.4MB/s gaviota (37.72 %) Baseline benchmark results, Pixel C with Android N2G48B Benchmark Time(ns) CPU(ns) Iterations --------------------------------------------------- BM_UFlat/0 148957 147565 1335 661.8MB/s html BM_UFlat/1 1527257 1500000 132 446.4MB/s urls BM_UFlat/2 19589 19397 8764 5.9GB/s jpg BM_UFlat/3 425 418 408163 455.3MB/s jpg_200 BM_UFlat/4 30096 29552 6497 3.2GB/s pdf BM_UFlat/5 595933 594594 333 657.0MB/s html4 BM_UFlat/6 516315 514360 383 282.0MB/s txt1 BM_UFlat/7 454653 453514 441 263.2MB/s txt2 BM_UFlat/8 1382687 1361111 144 299.0MB/s txt3 BM_UFlat/9 1967590 1904761 105 241.3MB/s txt4 BM_UFlat/10 148271 144560 1342 782.3MB/s pb BM_UFlat/11 523997 510471 382 344.4MB/s gaviota BM_UIOVec/0 478443 465227 417 209.9MB/s html BM_UIOVec/1 4172860 4060000 100 164.9MB/s urls BM_UIOVec/2 21470 20975 7342 5.5GB/s jpg BM_UIOVec/3 1357 1330 75187 143.4MB/s jpg_200 BM_UIOVec/4 63143 61365 3031 1.6GB/s pdf BM_UValidate/0 86910 85125 2279 1.1GB/s html BM_UValidate/1 1022256 1000000 195 669.6MB/s urls BM_UValidate/2 420 417 400000 274.6GB/s jpg BM_UValidate/3 311 302 571428 630.0MB/s jpg_200 BM_UValidate/4 7778 7584 25445 12.6GB/s pdf BM_ZFlat/0 469209 457547 424 213.4MB/s html (22.31 %) BM_ZFlat/1 5633510 5460000 100 122.6MB/s urls (47.78 %) BM_ZFlat/2 37896 36693 4524 3.1GB/s jpg (99.95 %) BM_ZFlat/3 1485 1441 123456 132.3MB/s jpg_200 (73.00 %) BM_ZFlat/4 74870 72775 2652 1.3GB/s pdf (83.30 %) BM_ZFlat/5 1857321 1785714 112 218.8MB/s html4 (22.52 %) BM_ZFlat/6 1538723 1492307 130 97.2MB/s txt1 (57.88 %) BM_ZFlat/7 1338236 1310810 148 91.1MB/s txt2 (61.91 %) BM_ZFlat/8 4050820 4040000 100 100.7MB/s txt3 (54.99 %) BM_ZFlat/9 5234940 5230000 100 87.9MB/s txt4 (66.26 %) BM_ZFlat/10 400309 400000 495 282.7MB/s pb (19.68 %) BM_ZFlat/11 1063042 1058510 188 166.1MB/s gaviota (37.72 %) --- CMakeLists.txt | 5 +---- cmake/config.h.in | 6 +++--- snappy-internal.h | 3 ++- snappy-stubs-internal.h | 35 ++++++++++++++++++++--------------- 4 files changed, 26 
insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9e70c8..2a90a08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,10 +8,7 @@ option(BUILD_SHARED_LIBS "Build shared libraries(DLLs)." OFF) option(SNAPPY_BUILD_TESTS "Build Snappy's own tests." ON) include(TestBigEndian) -test_big_endian(WORDS_BIG_ENDIAN) -if(WORDS_BIG_ENDIAN) - add_definitions(-DWORDS_BIGENDIAN=1) -endif(WORDS_BIG_ENDIAN) +test_big_endian(SNAPPY_IS_BIG_ENDIAN) include(CheckIncludeFile) check_include_file("byteswap.h" HAVE_BYTESWAP_H) diff --git a/cmake/config.h.in b/cmake/config.h.in index 64f2648..97cd818 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -55,8 +55,8 @@ /* Define to 1 if you have the <windows.h> header file. */ #cmakedefine HAVE_WINDOWS_H 1 -/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most - significant byte first (like Motorola and SPARC, unlike Intel and VAX). */ -#cmakedefine WORDS_BIGENDIAN 1 +/* Define to 1 if your processor stores words with the most significant byte + first (like Motorola and SPARC, unlike Intel and VAX). */ +#cmakedefine SNAPPY_IS_BIG_ENDIAN 1 #endif // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_ diff --git a/snappy-internal.h b/snappy-internal.h index c12637d..4b53d59 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -83,7 +83,8 @@ char* CompressFragment(const char* input, // Requires that s2_limit >= s2. // // Separate implementation for 64-bit, little-endian cpus. 
-#if defined(ARCH_K8) || (defined(ARCH_PPC) && !defined(WORDS_BIGENDIAN)) +#if !defined(SNAPPY_IS_BIG_ENDIAN) && \ + (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)) static inline std::pair<size_t, bool> FindMatchLength(const char* s1, const char* s2, const char* s2_limit) { diff --git a/snappy-stubs-internal.h b/snappy-stubs-internal.h index 9898f18..cb605f8 100644 --- a/snappy-stubs-internal.h +++ b/snappy-stubs-internal.h @@ -64,6 +64,10 @@ #define ARCH_PPC 1 +#elif defined(__aarch64__) + +#define ARCH_ARM 1 + #endif // Needed by OS X, among others. @@ -104,9 +108,10 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL); // Potentially unaligned loads and stores. -// x86 and PowerPC can simply do these loads and stores native. +// x86, PowerPC, and ARM64 can simply do these loads and stores native. -#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) +#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || \ + defined(__aarch64__) #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p)) #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p)) @@ -234,7 +239,7 @@ inline void UNALIGNED_STORE64(void *p, uint64 v) { #endif // The following guarantees declaration of the byte swap functions. -#ifdef WORDS_BIGENDIAN +#if defined(SNAPPY_IS_BIG_ENDIAN) #ifdef HAVE_SYS_BYTEORDER_H #include <sys/byteorder.h> @@ -291,7 +296,7 @@ inline uint64 bswap_64(uint64 x) { #endif -#endif // WORDS_BIGENDIAN +#endif // defined(SNAPPY_IS_BIG_ENDIAN) // Convert to little-endian storage, opposite of network format. // Convert x from host to little endian: x = LittleEndian.FromHost(x); @@ -305,7 +310,7 @@ inline uint64 bswap_64(uint64 x) { class LittleEndian { public: // Conversion functions. 
-#ifdef WORDS_BIGENDIAN +#if defined(SNAPPY_IS_BIG_ENDIAN) static uint16 FromHost16(uint16 x) { return bswap_16(x); } static uint16 ToHost16(uint16 x) { return bswap_16(x); } @@ -315,7 +320,7 @@ class LittleEndian { static bool IsLittleEndian() { return false; } -#else // !defined(WORDS_BIGENDIAN) +#else // !defined(SNAPPY_IS_BIG_ENDIAN) static uint16 FromHost16(uint16 x) { return x; } static uint16 ToHost16(uint16 x) { return x; } @@ -325,7 +330,7 @@ class LittleEndian { static bool IsLittleEndian() { return true; } -#endif // !defined(WORDS_BIGENDIAN) +#endif // !defined(SNAPPY_IS_BIG_ENDIAN) // Functions to do unaligned loads and stores in little-endian order. static uint16 Load16(const void *p) { @@ -356,9 +361,9 @@ class Bits { // that it's 0-indexed. static int FindLSBSetNonZero(uint32 n); -#if defined(ARCH_K8) || defined(ARCH_PPC) +#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) static int FindLSBSetNonZero64(uint64 n); -#endif // defined(ARCH_K8) || defined(ARCH_PPC) +#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) private: // No copying @@ -376,11 +381,11 @@ inline int Bits::FindLSBSetNonZero(uint32 n) { return __builtin_ctz(n); } -#if defined(ARCH_K8) || defined(ARCH_PPC) +#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) inline int Bits::FindLSBSetNonZero64(uint64 n) { return __builtin_ctzll(n); } -#endif // defined(ARCH_K8) || defined(ARCH_PPC) +#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) #elif defined(_MSC_VER) @@ -399,13 +404,13 @@ inline int Bits::FindLSBSetNonZero(uint32 n) { return 32; } -#if defined(ARCH_K8) || defined(ARCH_PPC) +#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) inline int Bits::FindLSBSetNonZero64(uint64 n) { unsigned long where; if (_BitScanForward64(&where, n)) return static_cast<int>(where); return 64; } -#endif // defined(ARCH_K8) || defined(ARCH_PPC) +#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) #else // Portable 
versions. @@ -439,7 +444,7 @@ inline int Bits::FindLSBSetNonZero(uint32 n) { return rc; } -#if defined(ARCH_K8) || defined(ARCH_PPC) +#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) // FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero(). inline int Bits::FindLSBSetNonZero64(uint64 n) { const uint32 bottombits = static_cast<uint32>(n); @@ -450,7 +455,7 @@ inline int Bits::FindLSBSetNonZero64(uint64 n) { return FindLSBSetNonZero(bottombits); } } -#endif // defined(ARCH_K8) || defined(ARCH_PPC) +#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) #endif // End portable versions.