Re-work fast path for handling copies in zippy decompression.

This is a performance-tuning change that shouldn't change the behavior
of the library.

This adds some complexity but the performance gain might make that
worthwhile: With FDO on perflab/haswell, a 4.0% gain (geometric mean).

SAMPLE (before)

Benchmark         Time(ns)    CPU(ns) Iterations
------------------------------------------------
BM_UFlat/0           36638      36552     100000 2.6GB/s  html
BM_UFlat/1          457153     455895       9173 1.4GB/s  urls
BM_UFlat/2            5850       5837     685481 19.6GB/s  jpg
BM_UFlat/3             122        122   34551988 1.5GB/s  jpg_200
BM_UFlat/4            6797       6781     620811 14.1GB/s  pdf
BM_UFlat/5          179485     179037      23471 2.1GB/s  html4
BM_UFlat/6          142734     142384      29525 1018.7MB/s  txt1
BM_UFlat/7          125233     124924      33709 955.6MB/s  txt2
BM_UFlat/8          382548     381533      10000 1066.7MB/s  txt3
BM_UFlat/9          525614     524297       8018 876.5MB/s  txt4
BM_UFlat/10          34946      34868     100000 3.2GB/s  pb
BM_UFlat/11         149548     149208      28063 1.2GB/s  gaviota
BM_UFlat/12          10684      10663     392580 2.1GB/s  cp
BM_UFlat/13           5494       5484     766584 1.9GB/s  c
BM_UFlat/14           1691       1688    2488784 2.1GB/s  lsp
BM_UFlat/15         676443     674726       6129 1.4GB/s  xls
BM_UFlat/16            156        156   26656909 1.2GB/s  xls_200
BM_UFlat/17         239911     239297      17558 2.0GB/s  bin
BM_UFlat/18            182        182   23072932 1047.9MB/s  bin_200
BM_UFlat/19          21544      21499     194484 1.7GB/s  sum
BM_UFlat/20           2236       2232    1877810 1.8GB/s  man
BM_UFlatSink/0       42266      42179      99732 2.3GB/s  html
BM_UFlatSink/1      461810     460633       9055 1.4GB/s  urls
BM_UFlatSink/2        5816       5804     632829 19.8GB/s  jpg
BM_UFlatSink/3         124        123   34351698 1.5GB/s  jpg_200
BM_UFlatSink/4        7173       7157     609929 13.3GB/s  pdf
BM_UFlatSink/5      184795     184302      22660 2.1GB/s  html4
BM_UFlatSink/6      143552     143223      29272 1012.7MB/s  txt1
BM_UFlatSink/7      127160     126890      33178 940.8MB/s  txt2
BM_UFlatSink/8      382219     381313      10000 1067.3MB/s  txt3
BM_UFlatSink/9      528042     526713       7988 872.5MB/s  txt4
BM_UFlatSink/10      41389      41305     100000 2.7GB/s  pb
BM_UFlatSink/11     147215     146877      28854 1.2GB/s  gaviota
BM_UFlatSink/12      12008      11984     348139 1.9GB/s  cp
BM_UFlatSink/13       5444       5433     775084 1.9GB/s  c
BM_UFlatSink/14       1647       1644    2552119 2.1GB/s  lsp
BM_UFlatSink/15     665011     663424       6320 1.4GB/s  xls
BM_UFlatSink/16        153        153   27571837 1.2GB/s  xls_200
BM_UFlatSink/17     239735     239169      17411 2.0GB/s  bin
BM_UFlatSink/18        183        182   23005573 1046.8MB/s  bin_200
BM_UFlatSink/19      22544      22498     187705 1.6GB/s  sum
BM_UFlatSink/20       2190       2186    1917894 1.8GB/s  man

SAMPLE (after)

Benchmark         Time(ns)    CPU(ns) Iterations
------------------------------------------------
BM_UFlat/0           33940      33889     100000 2.8GB/s  html
BM_UFlat/1          440728     439944       9586 1.5GB/s  urls
BM_UFlat/2            5652       5641     744776 20.3GB/s  jpg
BM_UFlat/3             123        123   34647884 1.5GB/s  jpg_200
BM_UFlat/4            6628       6615     631892 14.4GB/s  pdf
BM_UFlat/5          169523     169227      24197 2.3GB/s  html4
BM_UFlat/6          144139     143892      29232 1008.0MB/s  txt1
BM_UFlat/7          127148     126915      33144 940.6MB/s  txt2
BM_UFlat/8          380267     379233      10000 1073.2MB/s  txt3
BM_UFlat/9          529495     528194       7957 870.0MB/s  txt4
BM_UFlat/10          31844      31784     100000 3.5GB/s  pb
BM_UFlat/11         146822     146476      28737 1.2GB/s  gaviota
BM_UFlat/12          10784      10762     392176 2.1GB/s  cp
BM_UFlat/13           5528       5518     760934 1.9GB/s  c
BM_UFlat/14           1721       1719    2449291 2.0GB/s  lsp
BM_UFlat/15         673304     671774       6255 1.4GB/s  xls
BM_UFlat/16            155        155   27092003 1.2GB/s  xls_200
BM_UFlat/17         230424     229902      18285 2.1GB/s  bin
BM_UFlat/18            185        184   22818199 1033.9MB/s  bin_200
BM_UFlat/19          21035      20996     200765 1.7GB/s  sum
BM_UFlat/20           2242       2238    1864380 1.8GB/s  man
BM_UFlatSink/0       33487      33405     100000 2.9GB/s  html
BM_UFlatSink/1      431108     430226       9764 1.5GB/s  urls
BM_UFlatSink/2        5927       5916     648112 19.4GB/s  jpg
BM_UFlatSink/3         123        122   34704423 1.5GB/s  jpg_200
BM_UFlatSink/4        6472       6461     653462 14.8GB/s  pdf
BM_UFlatSink/5      164309     163988      25567 2.3GB/s  html4
BM_UFlatSink/6      138274     138020      30311 1050.9MB/s  txt1
BM_UFlatSink/7      120844     120637      34708 989.6MB/s  txt2
BM_UFlatSink/8      371046     370366      10000 1098.9MB/s  txt3
BM_UFlatSink/9      510021     508982       8269 902.9MB/s  txt4
BM_UFlatSink/10      30889      30844     100000 3.6GB/s  pb
BM_UFlatSink/11     140752     140521      29903 1.2GB/s  gaviota
BM_UFlatSink/12      10162      10146     413600 2.3GB/s  cp
BM_UFlatSink/13       5264       5256     762398 2.0GB/s  c
BM_UFlatSink/14       1622       1619    2606069 2.1GB/s  lsp
BM_UFlatSink/15     646897     645756       6512 1.5GB/s  xls
BM_UFlatSink/16        150        150   28223595 1.2GB/s  xls_200
BM_UFlatSink/17     226096     225650      18629 2.1GB/s  bin
BM_UFlatSink/18        185        184   22907935 1035.3MB/s  bin_200
BM_UFlatSink/19      21369      21335     198881 1.7GB/s  sum
BM_UFlatSink/20       2139       2136    1953637 1.8GB/s  man
This commit is contained in:
Geoff Pike 2016-07-06 08:24:48 -07:00 committed by Alkis Evlogimenos
parent 4a74094080
commit 27c5d86527
1 changed files with 30 additions and 8 deletions

View File

@ -1067,19 +1067,41 @@ class SnappyArrayWriter {
if (produced <= offset - 1u) {
return false;
}
if (len <= 16 && offset >= 8 && space_left >= 16) {
// Fast path, used for the majority (70-80%) of dynamic invocations.
if (offset >= 8 && space_left >= 16) {
UnalignedCopy64(op - offset, op);
UnalignedCopy64(op - offset + 8, op + 8);
} else {
if (space_left >= len + kMaxIncrementCopyOverflow) {
IncrementalCopyFastPath(op - offset, op, len);
if (PREDICT_TRUE(len <= 16)) {
// Fast path, used for the majority (70-80%) of dynamic invocations.
op_ = op + len;
return true;
}
op += 16;
// Copy 8 bytes at a time. This will write as many as 7 bytes more
// than necessary, so we check if space_left >= len + 7.
if (space_left >= len + 7) {
const char* src = op - offset;
ssize_t l = len - 16; // 16 bytes were already handled, above.
do {
UnalignedCopy64(src, op);
src += 8;
op += 8;
l -= 8;
} while (l > 0);
// l is now negative if we wrote extra bytes; adjust op_ accordingly.
op_ = op + l;
return true;
} else if (space_left < len) {
return false;
} else {
if (space_left < len) {
return false;
}
len -= 16;
IncrementalCopy(op - offset, op, len);
}
} else if (space_left >= len + kMaxIncrementCopyOverflow) {
IncrementalCopyFastPath(op - offset, op, len);
} else if (space_left < len) {
return false;
} else {
IncrementalCopy(op - offset, op, len);
}
op_ = op + len;