Use 64-bit optimized code path for ARM64.

This is inspired by https://github.com/google/snappy/pull/22.

Benchmark results with the change, Pixel C with Android N2G48B

Benchmark            Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0             119544     119253       1501 818.9MB/s  html
BM_UFlat/1            1223950    1208588        163 554.0MB/s  urls
BM_UFlat/2              16081      15962      11527 7.2GB/s  jpg
BM_UFlat/3                356        352     416666 540.6MB/s  jpg_200
BM_UFlat/4              25010      24860       7683 3.8GB/s  pdf
BM_UFlat/5             484832     481572        407 811.1MB/s  html4
BM_UFlat/6             408410     408713        482 354.9MB/s  txt1
BM_UFlat/7             361714     361663        553 330.1MB/s  txt2
BM_UFlat/8            1090582    1087912        182 374.1MB/s  txt3
BM_UFlat/9            1503127    1503759        133 305.6MB/s  txt4
BM_UFlat/10            114183     114285       1715 989.6MB/s  pb
BM_UFlat/11            406714     407331        491 431.5MB/s  gaviota
BM_UIOVec/0            370397     369888        538 264.0MB/s  html
BM_UIOVec/1           3207510    3190000        100 209.9MB/s  urls
BM_UIOVec/2             16589      16573      11223 6.9GB/s  jpg
BM_UIOVec/3              1052       1052     165289 181.2MB/s  jpg_200
BM_UIOVec/4             49151      49184       3985 1.9GB/s  pdf
BM_UValidate/0          68115      68095       2893 1.4GB/s  html
BM_UValidate/1         792652     792000        250 845.4MB/s  urls
BM_UValidate/2            334        334     487804 343.1GB/s  jpg
BM_UValidate/3            235        235     666666 809.9MB/s  jpg_200
BM_UValidate/4           6126       6130      32626 15.6GB/s  pdf
BM_ZFlat/0             292697     290560        678 336.1MB/s  html (22.31 %)
BM_ZFlat/1            4062080    4050000        100 165.3MB/s  urls (47.78 %)
BM_ZFlat/2              29225      29274       6422 3.9GB/s  jpg (99.95 %)
BM_ZFlat/3               1099       1098     163934 173.7MB/s  jpg_200 (73.00 %)
BM_ZFlat/4              44117      44233       4205 2.2GB/s  pdf (83.30 %)
BM_ZFlat/5            1158058    1157894        171 337.4MB/s  html4 (22.52 %)
BM_ZFlat/6            1102983    1093922        181 132.6MB/s  txt1 (57.88 %)
BM_ZFlat/7             974142     975490        204 122.4MB/s  txt2 (61.91 %)
BM_ZFlat/8            2984670    2990000        100 136.1MB/s  txt3 (54.99 %)
BM_ZFlat/9            4100130    4090000        100 112.4MB/s  txt4 (66.26 %)
BM_ZFlat/10            276236     275139        716 411.0MB/s  pb (19.68 %)
BM_ZFlat/11            760091     759541        262 231.4MB/s  gaviota (37.72 %)

Baseline benchmark results, Pixel C with Android N2G48B

Benchmark            Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0             148957     147565       1335 661.8MB/s  html
BM_UFlat/1            1527257    1500000        132 446.4MB/s  urls
BM_UFlat/2              19589      19397       8764 5.9GB/s  jpg
BM_UFlat/3                425        418     408163 455.3MB/s  jpg_200
BM_UFlat/4              30096      29552       6497 3.2GB/s  pdf
BM_UFlat/5             595933     594594        333 657.0MB/s  html4
BM_UFlat/6             516315     514360        383 282.0MB/s  txt1
BM_UFlat/7             454653     453514        441 263.2MB/s  txt2
BM_UFlat/8            1382687    1361111        144 299.0MB/s  txt3
BM_UFlat/9            1967590    1904761        105 241.3MB/s  txt4
BM_UFlat/10            148271     144560       1342 782.3MB/s  pb
BM_UFlat/11            523997     510471        382 344.4MB/s  gaviota
BM_UIOVec/0            478443     465227        417 209.9MB/s  html
BM_UIOVec/1           4172860    4060000        100 164.9MB/s  urls
BM_UIOVec/2             21470      20975       7342 5.5GB/s  jpg
BM_UIOVec/3              1357       1330      75187 143.4MB/s  jpg_200
BM_UIOVec/4             63143      61365       3031 1.6GB/s  pdf
BM_UValidate/0          86910      85125       2279 1.1GB/s  html
BM_UValidate/1        1022256    1000000        195 669.6MB/s  urls
BM_UValidate/2            420        417     400000 274.6GB/s  jpg
BM_UValidate/3            311        302     571428 630.0MB/s  jpg_200
BM_UValidate/4           7778       7584      25445 12.6GB/s  pdf
BM_ZFlat/0             469209     457547        424 213.4MB/s  html (22.31 %)
BM_ZFlat/1            5633510    5460000        100 122.6MB/s  urls (47.78 %)
BM_ZFlat/2              37896      36693       4524 3.1GB/s  jpg (99.95 %)
BM_ZFlat/3               1485       1441     123456 132.3MB/s  jpg_200 (73.00 %)
BM_ZFlat/4              74870      72775       2652 1.3GB/s  pdf (83.30 %)
BM_ZFlat/5            1857321    1785714        112 218.8MB/s  html4 (22.52 %)
BM_ZFlat/6            1538723    1492307        130 97.2MB/s  txt1 (57.88 %)
BM_ZFlat/7            1338236    1310810        148 91.1MB/s  txt2 (61.91 %)
BM_ZFlat/8            4050820    4040000        100 100.7MB/s  txt3 (54.99 %)
BM_ZFlat/9            5234940    5230000        100 87.9MB/s  txt4 (66.26 %)
BM_ZFlat/10            400309     400000        495 282.7MB/s  pb (19.68 %)
BM_ZFlat/11           1063042    1058510        188 166.1MB/s  gaviota (37.72 %)
This commit is contained in:
costan 2017-08-16 12:38:06 -07:00 committed by Victor Costan
parent 77c12adc19
commit 632cd0f128
4 changed files with 26 additions and 23 deletions

View File

@ -8,10 +8,7 @@ option(BUILD_SHARED_LIBS "Build shared libraries(DLLs)." OFF)
option(SNAPPY_BUILD_TESTS "Build Snappy's own tests." ON)
include(TestBigEndian)
test_big_endian(WORDS_BIG_ENDIAN)
if(WORDS_BIG_ENDIAN)
add_definitions(-DWORDS_BIGENDIAN=1)
endif(WORDS_BIG_ENDIAN)
test_big_endian(SNAPPY_IS_BIG_ENDIAN)
include(CheckIncludeFile)
check_include_file("byteswap.h" HAVE_BYTESWAP_H)

View File

@ -55,8 +55,8 @@
/* Define to 1 if you have the <windows.h> header file. */
#cmakedefine HAVE_WINDOWS_H 1
/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
significant byte first (like Motorola and SPARC, unlike Intel and VAX). */
#cmakedefine WORDS_BIGENDIAN 1
/* Define to 1 if your processor stores words with the most significant byte
first (like Motorola and SPARC, unlike Intel and VAX). */
#cmakedefine SNAPPY_IS_BIG_ENDIAN 1
#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_

View File

@ -83,7 +83,8 @@ char* CompressFragment(const char* input,
// Requires that s2_limit >= s2.
//
// Separate implementation for 64-bit, little-endian cpus.
#if defined(ARCH_K8) || (defined(ARCH_PPC) && !defined(WORDS_BIGENDIAN))
#if !defined(SNAPPY_IS_BIG_ENDIAN) && \
(defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
const char* s2,
const char* s2_limit) {

View File

@ -64,6 +64,10 @@
#define ARCH_PPC 1
#elif defined(__aarch64__)
#define ARCH_ARM 1
#endif
// Needed by OS X, among others.
@ -104,9 +108,10 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
// Potentially unaligned loads and stores.
// x86 and PowerPC can simply do these loads and stores native.
// x86, PowerPC, and ARM64 can simply do these loads and stores native.
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || \
defined(__aarch64__)
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
@ -234,7 +239,7 @@ inline void UNALIGNED_STORE64(void *p, uint64 v) {
#endif
// The following guarantees declaration of the byte swap functions.
#ifdef WORDS_BIGENDIAN
#if defined(SNAPPY_IS_BIG_ENDIAN)
#ifdef HAVE_SYS_BYTEORDER_H
#include <sys/byteorder.h>
@ -291,7 +296,7 @@ inline uint64 bswap_64(uint64 x) {
#endif
#endif // WORDS_BIGENDIAN
#endif // defined(SNAPPY_IS_BIG_ENDIAN)
// Convert to little-endian storage, opposite of network format.
// Convert x from host to little endian: x = LittleEndian.FromHost(x);
@ -305,7 +310,7 @@ inline uint64 bswap_64(uint64 x) {
class LittleEndian {
public:
// Conversion functions.
#ifdef WORDS_BIGENDIAN
#if defined(SNAPPY_IS_BIG_ENDIAN)
static uint16 FromHost16(uint16 x) { return bswap_16(x); }
static uint16 ToHost16(uint16 x) { return bswap_16(x); }
@ -315,7 +320,7 @@ class LittleEndian {
static bool IsLittleEndian() { return false; }
#else // !defined(WORDS_BIGENDIAN)
#else // !defined(SNAPPY_IS_BIG_ENDIAN)
static uint16 FromHost16(uint16 x) { return x; }
static uint16 ToHost16(uint16 x) { return x; }
@ -325,7 +330,7 @@ class LittleEndian {
static bool IsLittleEndian() { return true; }
#endif // !defined(WORDS_BIGENDIAN)
#endif // !defined(SNAPPY_IS_BIG_ENDIAN)
// Functions to do unaligned loads and stores in little-endian order.
static uint16 Load16(const void *p) {
@ -356,9 +361,9 @@ class Bits {
// that it's 0-indexed.
static int FindLSBSetNonZero(uint32 n);
#if defined(ARCH_K8) || defined(ARCH_PPC)
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
static int FindLSBSetNonZero64(uint64 n);
#endif // defined(ARCH_K8) || defined(ARCH_PPC)
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
private:
// No copying
@ -376,11 +381,11 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
return __builtin_ctz(n);
}
#if defined(ARCH_K8) || defined(ARCH_PPC)
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
inline int Bits::FindLSBSetNonZero64(uint64 n) {
return __builtin_ctzll(n);
}
#endif // defined(ARCH_K8) || defined(ARCH_PPC)
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#elif defined(_MSC_VER)
@ -399,13 +404,13 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
return 32;
}
#if defined(ARCH_K8) || defined(ARCH_PPC)
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
inline int Bits::FindLSBSetNonZero64(uint64 n) {
unsigned long where;
if (_BitScanForward64(&where, n)) return static_cast<int>(where);
return 64;
}
#endif // defined(ARCH_K8) || defined(ARCH_PPC)
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#else // Portable versions.
@ -439,7 +444,7 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
return rc;
}
#if defined(ARCH_K8) || defined(ARCH_PPC)
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero().
inline int Bits::FindLSBSetNonZero64(uint64 n) {
const uint32 bottombits = static_cast<uint32>(n);
@ -450,7 +455,7 @@ inline int Bits::FindLSBSetNonZero64(uint64 n) {
return FindLSBSetNonZero(bottombits);
}
}
#endif // defined(ARCH_K8) || defined(ARCH_PPC)
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#endif // End portable versions.