mirror of https://github.com/google/snappy.git
Speed up zippy decompression by removing some zero-extensions.
This is a performance tuning change that should not affect correctness. On perflab with FDO on Haswell, the geometric-mean time drops from 21,776ns to 21,255ns, a gain of about 2.4%. SAMPLE PERFORMANCE with FDO on HASWELL (NEW) Benchmark Time(ns) CPU(ns) Iterations ------------------------------------------------ BM_UFlat/0 37366 37279 100000 2.6GB/s html BM_UFlat/1 471153 470204 8975 1.4GB/s urls BM_UFlat/2 6116 6105 639496 18.8GB/s jpg BM_UFlat/3 123 123 34709908 1.5GB/s jpg_200 BM_UFlat/4 6724 6714 623318 14.2GB/s pdf BM_UFlat/5 183122 182722 23138 2.1GB/s html4 BM_UFlat/6 144981 144689 29384 1002.5MB/s txt1 BM_UFlat/7 125939 125691 33423 949.8MB/s txt2 BM_UFlat/8 383101 382241 10000 1064.7MB/s txt3 BM_UFlat/9 527824 526606 7958 872.6MB/s txt4 BM_UFlat/10 34849 34790 100000 3.2GB/s pb BM_UFlat/11 150213 149937 28131 1.1GB/s gaviota BM_UFlat/12 10850 10830 393231 2.1GB/s cp BM_UFlat/13 5532 5523 735739 1.9GB/s c BM_UFlat/14 1698 1695 2478035 2.0GB/s lsp BM_UFlat/15 678396 676917 6200 1.4GB/s xls BM_UFlat/16 155 155 26909789 1.2GB/s xls_200 BM_UFlat/17 241235 240698 17416 2.0GB/s bin BM_UFlat/18 183 183 23000841 1043.5MB/s bin_200 BM_UFlat/19 21461 21424 193275 1.7GB/s sum BM_UFlat/20 2232 2228 1887191 1.8GB/s man BM_UFlatSink/0 42272 42199 98528 2.3GB/s html BM_UFlatSink/1 460814 459898 9092 1.4GB/s urls BM_UFlatSink/2 5558 5547 768629 20.7GB/s jpg BM_UFlatSink/3 124 123 33629141 1.5GB/s jpg_200 BM_UFlatSink/4 6634 6621 629989 14.4GB/s pdf BM_UFlatSink/5 182883 182491 23030 2.1GB/s html4 BM_UFlatSink/6 143269 142964 29410 1014.5MB/s txt1 BM_UFlatSink/7 127041 126809 33136 941.4MB/s txt2 BM_UFlatSink/8 384367 383577 10000 1061.0MB/s txt3 BM_UFlatSink/9 529979 528890 7898 868.9MB/s txt4 BM_UFlatSink/10 41154 41075 100000 2.7GB/s pb BM_UFlatSink/11 146446 146155 28742 1.2GB/s gaviota BM_UFlatSink/12 11939 11918 352663 1.9GB/s cp BM_UFlatSink/13 5430 5421 770451 1.9GB/s c BM_UFlatSink/14 1665 1662 2538921 2.1GB/s lsp BM_UFlatSink/15 666840 665617 6309 
1.4GB/s xls BM_UFlatSink/16 152 152 27639460 1.2GB/s xls_200 BM_UFlatSink/17 240076 239573 17643 2.0GB/s bin BM_UFlatSink/18 183 182 23128210 1046.0MB/s bin_200 BM_UFlatSink/19 22570 22528 185839 1.6GB/s sum BM_UFlatSink/20 2183 2180 1899526 1.8GB/s man SAMPLE PERFORMANCE with FDO on HASWELL (OLD) Benchmark Time(ns) CPU(ns) Iterations ------------------------------------------------ BM_UFlat/0 37041 36990 100000 2.6GB/s html BM_UFlat/1 471384 470574 8930 1.4GB/s urls BM_UFlat/2 5997 5986 722354 19.2GB/s jpg BM_UFlat/3 124 123 34964717 1.5GB/s jpg_200 BM_UFlat/4 6850 6838 621414 13.9GB/s pdf BM_UFlat/5 182578 182271 23001 2.1GB/s html4 BM_UFlat/6 148338 147989 28132 980.1MB/s txt1 BM_UFlat/7 130682 130471 32347 915.0MB/s txt2 BM_UFlat/8 397420 396553 10000 1026.3MB/s txt3 BM_UFlat/9 550126 548872 7736 837.2MB/s txt4 BM_UFlat/10 35013 34958 100000 3.2GB/s pb BM_UFlat/11 152270 151889 27508 1.1GB/s gaviota BM_UFlat/12 11117 11096 379059 2.1GB/s cp BM_UFlat/13 5812 5801 725240 1.8GB/s c BM_UFlat/14 1780 1777 2383982 2.0GB/s lsp BM_UFlat/15 707871 706139 5946 1.4GB/s xls BM_UFlat/16 157 157 26889747 1.2GB/s xls_200 BM_UFlat/17 239160 238556 17512 2.0GB/s bin BM_UFlat/18 181 180 23326040 1057.5MB/s bin_200 BM_UFlat/19 22706 22656 186285 1.6GB/s sum BM_UFlat/20 2319 2315 1813186 1.7GB/s man BM_UFlatSink/0 42657 42574 99000 2.2GB/s html BM_UFlatSink/1 466316 465262 9036 1.4GB/s urls BM_UFlatSink/2 6873 6859 648525 16.7GB/s jpg BM_UFlatSink/3 124 124 34434643 1.5GB/s jpg_200 BM_UFlatSink/4 6804 6790 624282 14.0GB/s pdf BM_UFlatSink/5 185468 185062 22746 2.1GB/s html4 BM_UFlatSink/6 148511 148209 28284 978.6MB/s txt1 BM_UFlatSink/7 130865 130607 32144 914.0MB/s txt2 BM_UFlatSink/8 393931 392983 10000 1035.6MB/s txt3 BM_UFlatSink/9 545548 544275 7740 844.3MB/s txt4 BM_UFlatSink/10 41659 41584 100000 2.7GB/s pb BM_UFlatSink/11 152062 151721 27854 1.1GB/s gaviota BM_UFlatSink/12 11987 11968 350909 1.9GB/s cp BM_UFlatSink/13 5652 5641 743280 1.8GB/s c BM_UFlatSink/14 1728 1725 
2446140 2.0GB/s lsp BM_UFlatSink/15 687879 686231 6138 1.4GB/s xls BM_UFlatSink/16 155 155 27254484 1.2GB/s xls_200 BM_UFlatSink/17 240689 240083 17450 2.0GB/s bin BM_UFlatSink/18 183 182 22932858 1046.8MB/s bin_200 BM_UFlatSink/19 22718 22674 185207 1.6GB/s sum BM_UFlatSink/20 2272 2268 1851664 1.7GB/s man
This commit is contained in:
parent
e788e527d3
commit
fce661fa8c
|
@@ -614,15 +614,15 @@ class SnappyDecompressor {
|
|||
ip += literal_length;
|
||||
MAYBE_REFILL();
|
||||
} else {
|
||||
const uint32 entry = char_table[c];
|
||||
const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
|
||||
const uint32 length = entry & 0xff;
|
||||
const size_t entry = char_table[c];
|
||||
const size_t trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
|
||||
const size_t length = entry & 0xff;
|
||||
ip += entry >> 11;
|
||||
|
||||
// copy_offset/256 is encoded in bits 8..10. By just fetching
|
||||
// those bits, we get copy_offset (since the bit-field starts at
|
||||
// bit 8).
|
||||
const uint32 copy_offset = entry & 0x700;
|
||||
const size_t copy_offset = entry & 0x700;
|
||||
if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
|
||||
return;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue