Optimize unaligned loads and stores on MIPS, the same way as on ARM.

After optimization:
Benchmark            Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0             193373     192048       1017 508.5MB/s  html
BM_UFlat/1            1983820    1992190        100 336.1MB/s  urls
BM_UFlat/2              49493      49842       3997 2.3GB/s  jpg
BM_UFlat/3                962        943     190476 202.2MB/s  jpg_200
BM_UFlat/4              47904      48501       4188 2.0GB/s  pdf
BM_UFlat/5             794704     796872        250 490.2MB/s  html4
BM_UFlat/6             679119     677615        294 214.0MB/s  txt1
BM_UFlat/7             589044     587666        339 203.1MB/s  txt2
BM_UFlat/8            1823009    1827697        109 222.7MB/s  txt3
BM_UFlat/9            2486760    2460940        100 186.7MB/s  txt4
BM_UFlat/10            185680     187411       1063 603.5MB/s  pb
BM_UFlat/11            741161     734259        266 239.4MB/s  gaviota
BM_UIOVec/0            491849     490687        406 199.0MB/s  html
BM_UIOVec/1           4269690    4257810        100 157.3MB/s  urls
BM_UIOVec/2             53773      54106       3682 2.1GB/s  jpg
BM_UIOVec/3              2261       2255      86580 84.6MB/s  jpg_200
BM_UIOVec/4             75054      75604       2635 1.3GB/s  pdf
BM_UValidate/0         109026     107788       1812 906.0MB/s  html
BM_UValidate/1        1247281    1245118        160 537.7MB/s  urls
BM_UValidate/2            727        720     243902 159.1GB/s  jpg
BM_UValidate/3            508        507     338983 376.2MB/s  jpg_200
BM_UValidate/4          10552      10549      18885 9.0GB/s  pdf
BM_ZFlat/0             700492     701471        284 139.2MB/s  html (22.31 %)
BM_ZFlat/1            7840690    7812500        100 85.7MB/s  urls (47.78 %)
BM_ZFlat/2              60742      61091       3261 1.9GB/s  jpg (99.95 %)
BM_ZFlat/3               2971       2947      64935 64.7MB/s  jpg_200 (73.00 %)
BM_ZFlat/4             107158     105860       1845 922.5MB/s  pdf (83.30 %)
BM_ZFlat/5            2808390    2812500        100 138.9MB/s  html4 (22.52 %)
BM_ZFlat/6            2405510    2382810        100 60.9MB/s  txt1 (57.88 %)
BM_ZFlat/7            2084040    2070310        100 57.7MB/s  txt2 (61.91 %)
BM_ZFlat/8            6396990    6406250        100 63.5MB/s  txt3 (54.99 %)
BM_ZFlat/9            8521580    8515620        100 54.0MB/s  txt4 (66.26 %)
BM_ZFlat/10            665106     664063        300 170.3MB/s  pb (19.68 %)
BM_ZFlat/11           2192610    2187500        100 80.4MB/s  gaviota (37.72 %)

Before optimization:
Benchmark            Time(ns)    CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0             409250     408235        488 239.2MB/s  html
BM_UFlat/1            3779030    3750000        100 178.5MB/s  urls
BM_UFlat/2              50095      49446       3950 2.3GB/s  jpg
BM_UFlat/3               1342       1328     123456 143.5MB/s  jpg_200
BM_UFlat/4              70687      71517       2731 1.3GB/s  pdf
BM_UFlat/5            1661500    1660150        120 235.3MB/s  html4
BM_UFlat/6            1586744    1562496        125 92.8MB/s  txt1
BM_UFlat/7            1341067    1337040        149 89.3MB/s  txt2
BM_UFlat/8            4188730    4179690        100 97.4MB/s  txt3
BM_UFlat/9            5595520    5585930        100 82.3MB/s  txt4
BM_UFlat/10            369473     371677        536 304.3MB/s  pb
BM_UFlat/11           1830678    1827697        109 96.2MB/s  gaviota
BM_UIOVec/0            634178     634455        314 153.9MB/s  html
BM_UIOVec/1           5571610    5585940        100 119.9MB/s  urls
BM_UIOVec/2             54381      54610       3648 2.1GB/s  jpg
BM_UIOVec/3              2618       2587      72463 73.7MB/s  jpg_200
BM_UIOVec/4             88890      89416       2228 1.1GB/s  pdf
BM_UValidate/0         230980     227372        859 429.5MB/s  html
BM_UValidate/1        2178450    2187500        100 306.1MB/s  urls
BM_UValidate/2           1112       1101     166666 104.1GB/s  jpg
BM_UValidate/3            692        682     263157 279.3MB/s  jpg_200
BM_UValidate/4          23802      23898       8336 4.0GB/s  pdf
BM_ZFlat/0            4405980    4375000        100 22.3MB/s  html (22.31 %)
BM_ZFlat/1           52297430   52187500        100 12.8MB/s  urls (47.78 %)
BM_ZFlat/2             168751     169837       1173 691.2MB/s  jpg (99.95 %)
BM_ZFlat/3              21565      21716       8814 8.8MB/s  jpg_200 (73.00 %)
BM_ZFlat/4             706390     706446        282 138.2MB/s  pdf (83.30 %)
BM_ZFlat/5           17759550   17734370        100 22.0MB/s  html4 (22.52 %)
BM_ZFlat/6           12785910   12773440        100 11.4MB/s  txt1 (57.88 %)
BM_ZFlat/7           11020140   10976560        100 10.9MB/s  txt2 (61.91 %)
BM_ZFlat/8           34391200   34296880        100 11.9MB/s  txt3 (54.99 %)
BM_ZFlat/9           44832460   44726570        100 10.3MB/s  txt4 (66.26 %)
BM_ZFlat/10           4650820    4648440        100 24.3MB/s  pb (19.68 %)
BM_ZFlat/11          11624620   11601560        100 15.2MB/s  gaviota (37.72 %)
huangwenjun 2018-08-20 10:38:07 +08:00
parent ea660b57d6
commit 418a6e233c
3 changed files with 94 additions and 8 deletions

@@ -84,7 +84,7 @@ char* CompressFragment(const char* input,
 //
 // Separate implementation for 64-bit, little-endian cpus.
 #if !defined(SNAPPY_IS_BIG_ENDIAN) && \
-    (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
+    (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)|| defined(ARCH_MIPS))
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                       const char* s2,
                                                       const char* s2_limit) {
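
For context, the guard above enables the 64-bit FindMatchLength path on MIPS: it compares eight bytes at a time and locates the first differing byte from the trailing zero bits of the XOR of two unaligned 64-bit loads. A simplified little-endian sketch of that idea (not the exact snappy code; the real function also returns a flag in a std::pair):

#include <cstdint>
#include <cstring>

// Portable stand-in for UNALIGNED_LOAD64; on MIPS the patch routes this
// through ldl/ldr instead of memcpy.
static inline std::uint64_t Load64(const char* p) {
  std::uint64_t v;
  std::memcpy(&v, p, sizeof v);
  return v;
}

static std::size_t MatchLength(const char* s1, const char* s2,
                               const char* s2_limit) {
  std::size_t matched = 0;
  while (s2 + 8 <= s2_limit) {
    std::uint64_t a = Load64(s1 + matched);
    std::uint64_t b = Load64(s2);
    if (a == b) {
      s2 += 8;
      matched += 8;
    } else {
      // First mismatching byte = trailing zero bits of the XOR, divided by 8
      // (little-endian). __builtin_ctzll is what FindLSBSetNonZero64 wraps.
      std::uint64_t x = a ^ b;
      return matched + (__builtin_ctzll(x) >> 3);
    }
  }
  while (s2 < s2_limit && s1[matched] == *s2) {  // byte-at-a-time tail
    ++s2;
    ++matched;
  }
  return matched;
}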

@@ -80,6 +80,10 @@
 #define ARCH_ARM 1
+#elif defined(__mips__)
+#define ARCH_MIPS 1
 #endif
 
 // Needed by OS X, among others.
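
A quick probe shows which of these predefined macros a given toolchain sets; note that the patch keys ARCH_MIPS off __mips__ while the inline-asm load/store path below keys off __mips64, so a 32-bit MIPS build gets ARCH_MIPS but not the asm. The #pragma message lines are only illustrative:

// Probe the compiler's MIPS predefines; build with the target toolchain and
// read the diagnostics. Works with GCC and Clang.
#if defined(__mips__)
#pragma message "__mips__ defined -> ARCH_MIPS will be set"
#endif
#if defined(__mips64)
#pragma message "__mips64 defined -> 64-bit inline-asm unaligned path applies"
#endif
#if defined(__MIPSEL__)
#pragma message "little-endian MIPS"
#elif defined(__MIPSEB__)
#pragma message "big-endian MIPS"
#endif

int main() { return 0; }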
@@ -212,7 +216,77 @@ inline uint64 UNALIGNED_LOAD64(const void *p) {
 inline void UNALIGNED_STORE64(void *p, uint64 v) {
   memcpy(p, &v, sizeof v);
 }
 
+#elif defined(__mips64)
+
+inline uint16 UNALIGNED_LOAD16(const void *p) {
+  uint16 t;
+  __asm__ volatile (
+    ".set noat \n\t"
+    "lb %[t], 0x0(%[p]) \n\t"
+    "lb $1, 0x1(%[p]) \n\t"
+    "ins %[t], $1, 8, 8 \n\t"
+    :[t]"=&r"(t)
+    :[p]"r"(p)
+    :
+  );
+  return t;
+}
+
+inline void UNALIGNED_STORE16(void *p, uint16 v) {
+  __asm__ volatile (
+    ".set noat \n\t"
+    "sb %[v], 0x0(%[p]) \n\t"
+    "srl $1, %[v], 8 \n\t"
+    "sb $1, 0x1(%[p]) \n\t"
+    :
+    :[p]"r"(p),[v]"r"(v)
+    :
+  );
+}
+
+inline uint32 UNALIGNED_LOAD32(const void *p) {
+  uint32 t;
+  __asm__ volatile (
+    "lwl %[t], 0x3(%[p]) \n\t"
+    "lwr %[t], 0x0(%[p]) \n\t"
+    :[t]"=&r"(t)
+    :[p]"r"(p)
+    :
+  );
+  return t;
+}
+
+inline uint64 UNALIGNED_LOAD64(const void *p) {
+  uint64 t;
+  __asm__ volatile (
+    "ldl %[temp], 0x7(%[p]) \n\t"
+    "ldr %[temp], 0x0(%[p]) \n\t"
+    :[temp]"=&r"(t)
+    :[p]"r"(p)
+    :
+  );
+  return t;
+}
+
+inline void UNALIGNED_STORE32(void *p, uint32 v) {
+  __asm__ volatile (
+    "swl %[v], 0x3(%[p]) \n\t"
+    "swr %[v], 0x0(%[p]) \n\t"
+    :
+    :[p]"r"(p),[v]"r"(v)
+    :
+  );
+}
+
+inline void UNALIGNED_STORE64(void *p, uint64 v) {
+  __asm__ volatile (
+    "sdl %[v], 0x7(%[p]) \n\t"
+    "sdr %[v], 0x0(%[p]) \n\t"
+    :
+    :[p]"r"(p),[v]"r"(v)
+    :
+  );
+}
+
 #else
 
 // These functions are provided for architectures that don't support
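
The lwl/lwr and ldl/ldr pairs above are the classic MIPS unaligned-access idiom: on a little-endian core the "right" load fills the low bytes starting at the given address and the "left" load fills the remaining high bytes from address+3 (or +7 for a doubleword). Below is a standalone check of the 32-bit case against the portable memcpy path, assuming a little-endian MIPS64 toolchain; the function names and the extra "memory" clobber are mine, not part of the patch:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Portable reference: correct on any target.
static std::uint32_t Load32Memcpy(const void* p) {
  std::uint32_t v;
  std::memcpy(&v, p, sizeof v);
  return v;
}

#if defined(__mips64) && defined(__MIPSEL__)
// Same lwl/lwr pairing as the patch (little-endian only). The "memory"
// clobber is added here defensively, since the asm reads bytes that the
// operand constraints do not describe.
static std::uint32_t Load32LwlLwr(const void* p) {
  std::uint32_t t;
  __asm__ volatile(
      "lwl %[t], 0x3(%[p]) \n\t"
      "lwr %[t], 0x0(%[p]) \n\t"
      : [t] "=&r"(t)
      : [p] "r"(p)
      : "memory");
  return t;
}
#endif

int main() {
  unsigned char buf[16];
  for (int i = 0; i < 16; ++i) buf[i] = static_cast<unsigned char>(i * 17 + 1);
  for (int off = 0; off < 8; ++off) {  // deliberately misaligned offsets
    std::uint32_t ref = Load32Memcpy(buf + off);
#if defined(__mips64) && defined(__MIPSEL__)
    std::uint32_t got = Load32LwlLwr(buf + off);
    std::printf("off=%d ref=%08x asm=%08x %s\n", off, ref, got,
                ref == got ? "ok" : "MISMATCH");
#else
    std::printf("off=%d ref=%08x (non-MIPS build: memcpy path only)\n", off, ref);
#endif
  }
  return 0;
}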
@@ -343,7 +417,6 @@ class LittleEndian {
   static bool IsLittleEndian() { return true; }
 
 #endif  // !defined(SNAPPY_IS_BIG_ENDIAN)
-
   // Functions to do unaligned loads and stores in little-endian order.
   static uint16 Load16(const void *p) {
     return ToHost16(UNALIGNED_LOAD16(p));
@@ -373,9 +446,9 @@ class Bits {
   // that it's 0-indexed.
   static int FindLSBSetNonZero(uint32 n);
 
-#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
   static int FindLSBSetNonZero64(uint64 n);
-#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
 
  private:
   // No copying
@@ -393,11 +466,11 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
   return __builtin_ctz(n);
 }
 
-#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
 inline int Bits::FindLSBSetNonZero64(uint64 n) {
   return __builtin_ctzll(n);
 }
-#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
 
 #elif defined(_MSC_VER)
@@ -422,7 +495,7 @@ inline int Bits::FindLSBSetNonZero64(uint64 n) {
   if (_BitScanForward64(&where, n)) return static_cast<int>(where);
   return 64;
 }
-#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
+#endif  // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
 
 #else  // Portable versions.

@@ -102,9 +102,22 @@ size_t MaxCompressedLength(size_t source_len) {
 namespace {
 
 void UnalignedCopy64(const void* src, void* dst) {
+#if defined(__mips64)
+  __asm__ volatile (
+    ".set noat \n\t"
+    "ldl $1, 0x7(%[src]) \n\t"
+    "ldr $1, 0x0(%[src]) \n\t"
+    "sdl $1, 0x7(%[dst]) \n\t"
+    "sdr $1, 0x0(%[dst]) \n\t"
+    :
+    :[src]"r"(src),[dst]"r"(dst)
+    :
+  );
+#else
   char tmp[8];
   memcpy(tmp, src, 8);
   memcpy(dst, tmp, 8);
+#endif
 }
 
 void UnalignedCopy128(const void* src, void* dst) {
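
UnalignedCopy64 sits on the decompression hot path: literals and copies are moved in fixed 8-byte chunks that may overshoot the requested length, with slack space guaranteed around the buffers, so a cheap unaligned 8-byte copy pays off directly. Here is a rough sketch of that chunked-copy pattern under those assumptions; CopyWithSlop is illustrative, not snappy's actual copy routine:

#include <cstdio>
#include <cstring>

// Portable stand-in for the patched UnalignedCopy64: copy exactly 8 bytes
// between possibly unaligned pointers.
static inline void UnalignedCopy64(const void* src, void* dst) {
  char tmp[8];
  std::memcpy(tmp, src, 8);
  std::memcpy(dst, tmp, 8);
}

// Copy len bytes in 8-byte chunks, overshooting by up to 7 bytes. Both src
// and dst must have at least (len rounded up to 8) readable/writable bytes.
static void CopyWithSlop(const char* src, char* dst, std::size_t len) {
  char* end = dst + len;
  do {
    UnalignedCopy64(src, dst);
    src += 8;
    dst += 8;
  } while (dst < end);
}

int main() {
  const char msg[24] = "unaligned copy demo";    // zero-padded: read slack
  char out[24] = {0};                            // write slack
  CopyWithSlop(msg, out, std::strlen(msg) + 1);  // 20 bytes -> copies 24
  std::printf("%s\n", out);
  return 0;
}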
@@ -459,7 +472,7 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
 // is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
 // done at GetUint32AtOffset() time.
 
-#ifdef ARCH_K8
+#if defined(ARCH_K8) || defined(ARCH_ARM) || defined(ARCH_MIPS)
 typedef uint64 EightBytesReference;
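
With ARCH_ARM and ARCH_MIPS added to this guard, the compressor takes the 64-bit path: it loads eight bytes once and then peels 32-bit values out of the register by shifting, instead of issuing a separate unaligned 32-bit load per offset. A little-endian sketch of the idea; the helpers below are stand-ins, not the exact snappy definitions:

#include <cstdint>
#include <cstdio>
#include <cstring>

typedef std::uint64_t EightBytesReference;  // 64-bit path: one wide load

static inline EightBytesReference GetEightBytesAt(const char* p) {
  EightBytesReference v;
  std::memcpy(&v, p, sizeof v);  // stand-in for UNALIGNED_LOAD64
  return v;
}

// offset is 0..4; on little-endian the wanted 4 bytes sit 8*offset bits up,
// so a shift replaces another trip to memory.
static inline std::uint32_t GetUint32AtOffset(EightBytesReference v, int offset) {
  return static_cast<std::uint32_t>(v >> (8 * offset));
}

int main() {
  const char data[16] = "abcdefghijklmno";
  EightBytesReference r = GetEightBytesAt(data + 1);  // one 8-byte load
  std::printf("%08x %08x\n", GetUint32AtOffset(r, 0), GetUint32AtOffset(r, 2));
  return 0;
}

On a 32-bit target the reference is instead the source pointer itself, and the 32-bit load happens inside GetUint32AtOffset, as the comment in the hunk above says.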