mirror of https://github.com/google/snappy.git
Optimize unaligned load/store operations for MIPS, in the same way as for ARM.
After optimization: BM_UFlat/0 193373 192048 1017 508.5MB/s html BM_UFlat/1 1983820 1992190 100 336.1MB/s urls BM_UFlat/2 49493 49842 3997 2.3GB/s jpg BM_UFlat/3 962 943 190476 202.2MB/s jpg_200 BM_UFlat/4 47904 48501 4188 2.0GB/s pdf BM_UFlat/5 794704 796872 250 490.2MB/s html4 BM_UFlat/6 679119 677615 294 214.0MB/s txt1 BM_UFlat/7 589044 587666 339 203.1MB/s txt2 BM_UFlat/8 1823009 1827697 109 222.7MB/s txt3 BM_UFlat/9 2486760 2460940 100 186.7MB/s txt4 BM_UFlat/10 185680 187411 1063 603.5MB/s pb BM_UFlat/11 741161 734259 266 239.4MB/s gaviota BM_UIOVec/0 491849 490687 406 199.0MB/s html BM_UIOVec/1 4269690 4257810 100 157.3MB/s urls BM_UIOVec/2 53773 54106 3682 2.1GB/s jpg BM_UIOVec/3 2261 2255 86580 84.6MB/s jpg_200 BM_UIOVec/4 75054 75604 2635 1.3GB/s pdf BM_UValidate/0 109026 107788 1812 906.0MB/s html BM_UValidate/1 1247281 1245118 160 537.7MB/s urls BM_UValidate/2 727 720 243902 159.1GB/s jpg BM_UValidate/3 508 507 338983 376.2MB/s jpg_200 BM_UValidate/4 10552 10549 18885 9.0GB/s pdf BM_ZFlat/0 700492 701471 284 139.2MB/s html (22.31 %) BM_ZFlat/1 7840690 7812500 100 85.7MB/s urls (47.78 %) BM_ZFlat/2 60742 61091 3261 1.9GB/s jpg (99.95 %) BM_ZFlat/3 2971 2947 64935 64.7MB/s jpg_200 (73.00 %) BM_ZFlat/4 107158 105860 1845 922.5MB/s pdf (83.30 %) BM_ZFlat/5 2808390 2812500 100 138.9MB/s html4 (22.52 %) BM_ZFlat/6 2405510 2382810 100 60.9MB/s txt1 (57.88 %) BM_ZFlat/7 2084040 2070310 100 57.7MB/s txt2 (61.91 %) BM_ZFlat/8 6396990 6406250 100 63.5MB/s txt3 (54.99 %) BM_ZFlat/9 8521580 8515620 100 54.0MB/s txt4 (66.26 %) BM_ZFlat/10 665106 664063 300 170.3MB/s pb (19.68 %) BM_ZFlat/11 2192610 2187500 100 80.4MB/s gaviota (37.72 %) Before optimization: Benchmark Time(ns) CPU(ns) Iterations --------------------------------------------------- BM_UFlat/0 409250 408235 488 239.2MB/s html BM_UFlat/1 3779030 3750000 100 178.5MB/s urls BM_UFlat/2 50095 49446 3950 2.3GB/s jpg BM_UFlat/3 1342 1328 123456 143.5MB/s jpg_200 BM_UFlat/4 70687 71517 2731 1.3GB/s pdf BM_UFlat/5 1661500 
1660150 120 235.3MB/s html4 BM_UFlat/6 1586744 1562496 125 92.8MB/s txt1 BM_UFlat/7 1341067 1337040 149 89.3MB/s txt2 BM_UFlat/8 4188730 4179690 100 97.4MB/s txt3 BM_UFlat/9 5595520 5585930 100 82.3MB/s txt4 BM_UFlat/10 369473 371677 536 304.3MB/s pb BM_UFlat/11 1830678 1827697 109 96.2MB/s gaviota BM_UIOVec/0 634178 634455 314 153.9MB/s html BM_UIOVec/1 5571610 5585940 100 119.9MB/s urls BM_UIOVec/2 54381 54610 3648 2.1GB/s jpg BM_UIOVec/3 2618 2587 72463 73.7MB/s jpg_200 BM_UIOVec/4 88890 89416 2228 1.1GB/s pdf BM_UValidate/0 230980 227372 859 429.5MB/s html BM_UValidate/1 2178450 2187500 100 306.1MB/s urls BM_UValidate/2 1112 1101 166666 104.1GB/s jpg BM_UValidate/3 692 682 263157 279.3MB/s jpg_200 BM_UValidate/4 23802 23898 8336 4.0GB/s pdf BM_ZFlat/0 4405980 4375000 100 22.3MB/s html (22.31 %) BM_ZFlat/1 52297430 52187500 100 12.8MB/s urls (47.78 %) BM_ZFlat/2 168751 169837 1173 691.2MB/s jpg (99.95 %) BM_ZFlat/3 21565 21716 8814 8.8MB/s jpg_200 (73.00 %) BM_ZFlat/4 706390 706446 282 138.2MB/s pdf (83.30 %) BM_ZFlat/5 17759550 17734370 100 22.0MB/s html4 (22.52 %) BM_ZFlat/6 12785910 12773440 100 11.4MB/s txt1 (57.88 %) BM_ZFlat/7 11020140 10976560 100 10.9MB/s txt2 (61.91 %) BM_ZFlat/8 34391200 34296880 100 11.9MB/s txt3 (54.99 %) BM_ZFlat/9 44832460 44726570 100 10.3MB/s txt4 (66.26 %) BM_ZFlat/10 4650820 4648440 100 24.3MB/s pb (19.68 %) BM_ZFlat/11 11624620 11601560 100 15.2MB/s gaviota (37.72 %)
This commit is contained in:
parent
ea660b57d6
commit
418a6e233c
|
@ -84,7 +84,7 @@ char* CompressFragment(const char* input,
|
|||
//
|
||||
// Separate implementation for 64-bit, little-endian cpus.
|
||||
#if !defined(SNAPPY_IS_BIG_ENDIAN) && \
|
||||
(defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
|
||||
(defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)|| defined(ARCH_MIPS))
|
||||
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
|
||||
const char* s2,
|
||||
const char* s2_limit) {
|
||||
|
|
|
@ -80,6 +80,10 @@
|
|||
|
||||
#define ARCH_ARM 1
|
||||
|
||||
#elif defined(__mips__)
|
||||
|
||||
#define ARCH_MIPS 1
|
||||
|
||||
#endif
|
||||
|
||||
// Needed by OS X, among others.
|
||||
|
@ -212,7 +216,77 @@ inline uint64 UNALIGNED_LOAD64(const void *p) {
|
|||
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
||||
memcpy(p, &v, sizeof v);
|
||||
}
|
||||
#elif defined(__mips64)
|
||||
|
||||
// Loads a 16-bit value from `p` using two single-byte loads, so `p` does not
// have to be 2-byte aligned. The byte at p[0] becomes bits 0-7 of the result
// and the byte at p[1] is inserted into bits 8-15.
inline uint16 UNALIGNED_LOAD16(const void *p) {
  uint16 t;
  __asm__ volatile (
    // Save the assembler options so that "noat" does not leak into the code
    // assembled after this block.
    ".set push \n\t"
    ".set noat \n\t"
    // lbu (not lb) keeps the bits above bit 7 clear, so no sign-extension
    // bits are left above bit 15 of the output register.
    "lbu %[t], 0x0(%[p]) \n\t"
    "lbu $1, 0x1(%[p]) \n\t"
    "ins %[t], $1, 8, 8 \n\t"
    ".set pop \n\t"
    :[t]"=&r"(t)
    :[p]"r"(p)
    // $1 is used as scratch; "memory" because the asm reads bytes through
    // `p` that are not listed as operands.
    :"$1", "memory"
  );
  return t;
}
|
||||
|
||||
// Stores the 16-bit value `v` at `p` using two single-byte stores, so `p`
// does not have to be 2-byte aligned. Bits 0-7 of `v` go to p[0] and
// bits 8-15 go to p[1].
inline void UNALIGNED_STORE16(void *p, uint16 v) {
  __asm__ volatile (
    // Save the assembler options so that "noat" does not leak into the code
    // assembled after this block.
    ".set push \n\t"
    ".set noat \n\t"
    "sb %[v], 0x0(%[p]) \n\t"
    "srl $1, %[v], 8\n\t"
    "sb $1, 0x1(%[p]) \n\t"
    ".set pop \n\t"
    :
    :[p]"r"(p),[v]"r"(v)
    // $1 is used as scratch; "memory" because the asm writes through `p`,
    // which the compiler cannot see from the operand list.
    :"$1", "memory"
  );
}
|
||||
|
||||
// Loads a 32-bit value from the possibly unaligned address `p` with an
// lwl/lwr instruction pair.
// NOTE(review): the 0x3 (lwl) / 0x0 (lwr) offset pairing is the
// little-endian form; confirm this path is never built for big-endian MIPS.
inline uint32 UNALIGNED_LOAD32(const void *p) {
  uint32 t;
  __asm__ volatile (
    "lwl %[t], 0x3(%[p]) \n\t"
    "lwr %[t], 0x0(%[p]) \n\t"
    :[t]"=&r"(t)
    :[p]"r"(p)
    // The asm reads memory through `p` that is not listed as an operand, so
    // the compiler must not cache or reorder memory accesses across it.
    :"memory"
  );
  return t;
}
|
||||
|
||||
// Loads a 64-bit value from the possibly unaligned address `p` with an
// ldl/ldr instruction pair.
// NOTE(review): the 0x7 (ldl) / 0x0 (ldr) offset pairing is the
// little-endian form; confirm this path is never built for big-endian MIPS.
inline uint64 UNALIGNED_LOAD64(const void *p) {
  uint64 t;
  __asm__ volatile (
    "ldl %[t], 0x7(%[p]) \n\t"
    "ldr %[t], 0x0(%[p]) \n\t"
    :[t]"=&r"(t)
    :[p]"r"(p)
    // The asm reads memory through `p` that is not listed as an operand, so
    // the compiler must not cache or reorder memory accesses across it.
    :"memory"
  );
  return t;
}
|
||||
|
||||
// Stores the 32-bit value `v` at the possibly unaligned address `p` with an
// swl/swr instruction pair.
// NOTE(review): the 0x3 (swl) / 0x0 (swr) offset pairing is the
// little-endian form; confirm this path is never built for big-endian MIPS.
inline void UNALIGNED_STORE32(void *p, uint32 v) {
  __asm__ volatile (
    "swl %[v], 0x3(%[p]) \n\t"
    "swr %[v], 0x0(%[p]) \n\t"
    :
    :[p]"r"(p),[v]"r"(v)
    // The asm writes through `p`, which the compiler cannot see from the
    // operand list; without this clobber it may reorder or reuse stale
    // values of the destination bytes.
    :"memory"
  );
}
|
||||
|
||||
// Stores the 64-bit value `v` at the possibly unaligned address `p` with an
// sdl/sdr instruction pair.
// NOTE(review): the 0x7 (sdl) / 0x0 (sdr) offset pairing is the
// little-endian form; confirm this path is never built for big-endian MIPS.
inline void UNALIGNED_STORE64(void *p, uint64 v) {
  __asm__ volatile (
    "sdl %[v], 0x7(%[p]) \n\t"
    "sdr %[v], 0x0(%[p]) \n\t"
    :
    :[p]"r"(p),[v]"r"(v)
    // The asm writes through `p`, which the compiler cannot see from the
    // operand list; without this clobber it may reorder or reuse stale
    // values of the destination bytes.
    :"memory"
  );
}
|
||||
#else
|
||||
|
||||
// These functions are provided for architectures that don't support
|
||||
|
@ -343,7 +417,6 @@ class LittleEndian {
|
|||
// Unconditionally reports a little-endian host; this definition sits in the
// preprocessor branch closed by "#endif  // !defined(SNAPPY_IS_BIG_ENDIAN)".
static bool IsLittleEndian() {
  const bool host_is_little_endian = true;
  return host_is_little_endian;
}
|
||||
|
||||
#endif // !defined(SNAPPY_IS_BIG_ENDIAN)
|
||||
|
||||
// Functions to do unaligned loads and stores in little-endian order.
|
||||
static uint16 Load16(const void *p) {
|
||||
return ToHost16(UNALIGNED_LOAD16(p));
|
||||
|
@ -373,9 +446,9 @@ class Bits {
|
|||
// that it's 0-indexed.
|
||||
static int FindLSBSetNonZero(uint32 n);
|
||||
|
||||
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
|
||||
static int FindLSBSetNonZero64(uint64 n);
|
||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
|
||||
|
||||
private:
|
||||
// No copying
|
||||
|
@ -393,11 +466,11 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
|
|||
return __builtin_ctz(n);
|
||||
}
|
||||
|
||||
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||
#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
|
||||
// Returns the 0-indexed position of the least significant set bit of `n`,
// computed with the compiler's count-trailing-zeros builtin. As the name
// says, `n` must be non-zero.
inline int Bits::FindLSBSetNonZero64(uint64 n) {
  const int lowest_set_bit = __builtin_ctzll(n);
  return lowest_set_bit;
}
|
||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
|
||||
|
||||
#elif defined(_MSC_VER)
|
||||
|
||||
|
@ -422,7 +495,7 @@ inline int Bits::FindLSBSetNonZero64(uint64 n) {
|
|||
if (_BitScanForward64(&where, n)) return static_cast<int>(where);
|
||||
return 64;
|
||||
}
|
||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
|
||||
#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM) || defined(ARCH_MIPS)
|
||||
|
||||
#else // Portable versions.
|
||||
|
||||
|
|
15
snappy.cc
15
snappy.cc
|
@ -102,9 +102,22 @@ size_t MaxCompressedLength(size_t source_len) {
|
|||
namespace {
|
||||
|
||||
// Copies exactly 8 bytes from `src` to `dst`; neither pointer has to be
// aligned. All 8 source bytes are read before any destination byte is
// written, in both the MIPS64 path (one scratch register) and the portable
// path (a temporary buffer).
void UnalignedCopy64(const void* src, void* dst) {
#if defined(__mips64)
  __asm__ volatile (
    // Save the assembler options so that "noat" does not leak into the code
    // assembled after this block.
    ".set push \n\t"
    ".set noat \n\t"
    "ldl $1, 0x7(%[src]) \n\t"
    "ldr $1, 0x0(%[src]) \n\t"
    "sdl $1, 0x7(%[dst]) \n\t"
    "sdr $1, 0x0(%[dst]) \n\t"
    ".set pop \n\t"
    :
    :[src]"r"(src),[dst]"r"(dst)
    // $1 is used as scratch; "memory" because the asm reads through `src`
    // and writes through `dst`, neither of which appears as a memory operand.
    :"$1", "memory"
  );
#else
  char tmp[8];
  memcpy(tmp, src, 8);
  memcpy(dst, tmp, 8);
#endif
}
|
||||
|
||||
void UnalignedCopy128(const void* src, void* dst) {
|
||||
|
@ -459,7 +472,7 @@ uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
|
|||
// is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
|
||||
// done at GetUint32AtOffset() time.
|
||||
|
||||
#ifdef ARCH_K8
|
||||
#if defined(ARCH_K8) || defined(ARCH_ARM) || defined(ARCH_MIPS)
|
||||
|
||||
typedef uint64 EightBytesReference;
|
||||
|
||||
|
|
Loading…
Reference in New Issue