diff --git a/ChangeLog b/ChangeLog index 10aa968..b585fc0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,271 @@ +------------------------------------------------------------------------ +r80 | snappy.mirrorbot@gmail.com | 2013-08-13 14:55:00 +0200 (Tue, 13 Aug 2013) | 6 lines + +Add autoconf tests for size_t and ssize_t. Sort-of resolves public issue 79; +it would solve the problem if MSVC typically used autoconf. However, it gives +a natural place (config.h) to put the typedef even for MSVC. + +R=jsbell + +------------------------------------------------------------------------ +r79 | snappy.mirrorbot@gmail.com | 2013-07-29 13:06:44 +0200 (Mon, 29 Jul 2013) | 14 lines + +When we compare the number of bytes produced with the offset for a +backreference, make the signedness of the bytes produced clear, +by sticking it into a size_t. This avoids a signed/unsigned compare +warning from MSVC (public issue 71), and also is slightly clearer. + +Since the line is now so long the explanatory comment about the -1u +trick has to go somewhere else anyway, I used the opportunity to +explain it in slightly more detail. + +This is a purely stylistic change; the emitted assembler from GCC +is identical. + +R=jeff + +------------------------------------------------------------------------ +r78 | snappy.mirrorbot@gmail.com | 2013-06-30 21:24:03 +0200 (Sun, 30 Jun 2013) | 111 lines + +In the fast path for decompressing literals, instead of checking +whether there's 16 bytes free and then checking right afterwards +(when having subtracted the literal size) that there are now +5 bytes free, just check once for 21 bytes. This skips a compare +and a branch; although it is easily predictable, it is still +a few cycles on a fast path that we would like to get rid of. + +Benchmarking this yields very confusing results. 
On open-source +GCC 4.8.1 on Haswell, we get exactly the expected results; the +benchmarks where we hit the fast path for literals (in particular +the two HTML benchmarks and the protobuf benchmark) give very nice +speedups, and the others are not really affected. + +However, benchmarks with Google's GCC branch on other hardware +are much less clear. It seems that we have a weak loss in some cases +(and the wins for the “typical” win cases are not nearly as clear), +but that it depends on microarchitecture and plain luck in how we run +the benchmark. Looking at the generated assembler, it seems that +the removal of the if causes other large-scale changes in how the +function is laid out, which makes it likely that this is just bad luck. + +Thus, we should keep this change, even though its exact current impact is +unclear; it's a sensible change per se, and dropping it on the basis of +microoptimization for a given compiler (or even branch of a compiler) +would seem like a bad strategy in the long run. 
+ +Microbenchmark results (all in 64-bit, opt mode): + + Nehalem, Google GCC: + + Benchmark Base (ns) New (ns) Improvement + ------------------------------------------------------------------------------ + BM_UFlat/0 76747 75591 1.3GB/s html +1.5% + BM_UFlat/1 765756 757040 886.3MB/s urls +1.2% + BM_UFlat/2 10867 10893 10.9GB/s jpg -0.2% + BM_UFlat/3 124 131 1.4GB/s jpg_200 -5.3% + BM_UFlat/4 31663 31596 2.8GB/s pdf +0.2% + BM_UFlat/5 314162 308176 1.2GB/s html4 +1.9% + BM_UFlat/6 29668 29746 790.6MB/s cp -0.3% + BM_UFlat/7 12958 13386 796.4MB/s c -3.2% + BM_UFlat/8 3596 3682 966.0MB/s lsp -2.3% + BM_UFlat/9 1019193 1033493 953.3MB/s xls -1.4% + BM_UFlat/10 239 247 775.3MB/s xls_200 -3.2% + BM_UFlat/11 236411 240271 606.9MB/s txt1 -1.6% + BM_UFlat/12 206639 209768 571.2MB/s txt2 -1.5% + BM_UFlat/13 627803 635722 641.4MB/s txt3 -1.2% + BM_UFlat/14 845932 857816 538.2MB/s txt4 -1.4% + BM_UFlat/15 402107 391670 1.2GB/s bin +2.7% + BM_UFlat/16 283 279 683.6MB/s bin_200 +1.4% + BM_UFlat/17 46070 46815 781.5MB/s sum -1.6% + BM_UFlat/18 5053 5163 782.0MB/s man -2.1% + BM_UFlat/19 79721 76581 1.4GB/s pb +4.1% + BM_UFlat/20 251158 252330 697.5MB/s gaviota -0.5% + Sum of all benchmarks 4966150 4980396 -0.3% + + + Sandy Bridge, Google GCC: + + Benchmark Base (ns) New (ns) Improvement + ------------------------------------------------------------------------------ + BM_UFlat/0 42850 42182 2.3GB/s html +1.6% + BM_UFlat/1 525660 515816 1.3GB/s urls +1.9% + BM_UFlat/2 7173 7283 16.3GB/s jpg -1.5% + BM_UFlat/3 92 91 2.1GB/s jpg_200 +1.1% + BM_UFlat/4 15147 14872 5.9GB/s pdf +1.8% + BM_UFlat/5 199936 192116 2.0GB/s html4 +4.1% + BM_UFlat/6 12796 12443 1.8GB/s cp +2.8% + BM_UFlat/7 6588 6400 1.6GB/s c +2.9% + BM_UFlat/8 2010 1951 1.8GB/s lsp +3.0% + BM_UFlat/9 761124 763049 1.3GB/s xls -0.3% + BM_UFlat/10 186 189 1016.1MB/s xls_200 -1.6% + BM_UFlat/11 159354 158460 918.6MB/s txt1 +0.6% + BM_UFlat/12 139732 139950 856.1MB/s txt2 -0.2% + BM_UFlat/13 429917 425027 961.7MB/s txt3 +1.2% 
+ BM_UFlat/14 585255 587324 785.8MB/s txt4 -0.4% + BM_UFlat/15 276186 266173 1.8GB/s bin +3.8% + BM_UFlat/16 205 207 925.5MB/s bin_200 -1.0% + BM_UFlat/17 24925 24935 1.4GB/s sum -0.0% + BM_UFlat/18 2632 2576 1.5GB/s man +2.2% + BM_UFlat/19 40546 39108 2.8GB/s pb +3.7% + BM_UFlat/20 175803 168209 1048.9MB/s gaviota +4.5% + Sum of all benchmarks 3408117 3368361 +1.2% + + + Haswell, upstream GCC 4.8.1: + + Benchmark Base (ns) New (ns) Improvement + ------------------------------------------------------------------------------ + BM_UFlat/0 46308 40641 2.3GB/s html +13.9% + BM_UFlat/1 513385 514706 1.3GB/s urls -0.3% + BM_UFlat/2 6197 6151 19.2GB/s jpg +0.7% + BM_UFlat/3 61 61 3.0GB/s jpg_200 +0.0% + BM_UFlat/4 13551 13429 6.5GB/s pdf +0.9% + BM_UFlat/5 198317 190243 2.0GB/s html4 +4.2% + BM_UFlat/6 14768 12560 1.8GB/s cp +17.6% + BM_UFlat/7 6453 6447 1.6GB/s c +0.1% + BM_UFlat/8 1991 1980 1.8GB/s lsp +0.6% + BM_UFlat/9 766947 770424 1.2GB/s xls -0.5% + BM_UFlat/10 170 169 1.1GB/s xls_200 +0.6% + BM_UFlat/11 164350 163554 888.7MB/s txt1 +0.5% + BM_UFlat/12 145444 143830 832.1MB/s txt2 +1.1% + BM_UFlat/13 437849 438413 929.2MB/s txt3 -0.1% + BM_UFlat/14 603587 605309 759.8MB/s txt4 -0.3% + BM_UFlat/15 249799 248067 1.9GB/s bin +0.7% + BM_UFlat/16 191 188 1011.4MB/s bin_200 +1.6% + BM_UFlat/17 26064 24778 1.4GB/s sum +5.2% + BM_UFlat/18 2620 2601 1.5GB/s man +0.7% + BM_UFlat/19 44551 37373 3.0GB/s pb +19.2% + BM_UFlat/20 165408 164584 1.0GB/s gaviota +0.5% + Sum of all benchmarks 3408011 3385508 +0.7% + +------------------------------------------------------------------------ +r77 | snappy.mirrorbot@gmail.com | 2013-06-14 23:42:26 +0200 (Fri, 14 Jun 2013) | 92 lines + +Make the two IncrementalCopy* functions take in an ssize_t instead of a len, +in order to avoid having to do 32-to-64-bit signed conversions on a hot path +during decompression. (Also fixes some MSVC warnings, mentioned in public +issue 75, but more of those remain.) 
They cannot be size_t because we expect +them to go negative and test for that. + +This saves a few movzwl instructions, yielding ~2% speedup in decompression. + + +Sandy Bridge: + +Benchmark Base (ns) New (ns) Improvement +------------------------------------------------------------------------------------------------- +BM_UFlat/0 48009 41283 2.3GB/s html +16.3% +BM_UFlat/1 531274 513419 1.3GB/s urls +3.5% +BM_UFlat/2 7378 7062 16.8GB/s jpg +4.5% +BM_UFlat/3 92 92 2.0GB/s jpg_200 +0.0% +BM_UFlat/4 15057 14974 5.9GB/s pdf +0.6% +BM_UFlat/5 204323 193140 2.0GB/s html4 +5.8% +BM_UFlat/6 13282 12611 1.8GB/s cp +5.3% +BM_UFlat/7 6511 6504 1.6GB/s c +0.1% +BM_UFlat/8 2014 2030 1.7GB/s lsp -0.8% +BM_UFlat/9 775909 768336 1.3GB/s xls +1.0% +BM_UFlat/10 182 184 1043.2MB/s xls_200 -1.1% +BM_UFlat/11 167352 161630 901.2MB/s txt1 +3.5% +BM_UFlat/12 147393 142246 842.8MB/s txt2 +3.6% +BM_UFlat/13 449960 432853 944.4MB/s txt3 +4.0% +BM_UFlat/14 620497 594845 775.9MB/s txt4 +4.3% +BM_UFlat/15 265610 267356 1.8GB/s bin -0.7% +BM_UFlat/16 206 205 932.7MB/s bin_200 +0.5% +BM_UFlat/17 25561 24730 1.4GB/s sum +3.4% +BM_UFlat/18 2620 2644 1.5GB/s man -0.9% +BM_UFlat/19 45766 38589 2.9GB/s pb +18.6% +BM_UFlat/20 171107 169832 1039.5MB/s gaviota +0.8% +Sum of all benchmarks 3500103 3394565 +3.1% + + +Westmere: + +Benchmark Base (ns) New (ns) Improvement +------------------------------------------------------------------------------------------------- +BM_UFlat/0 72624 71526 1.3GB/s html +1.5% +BM_UFlat/1 735821 722917 930.8MB/s urls +1.8% +BM_UFlat/2 10450 10172 11.7GB/s jpg +2.7% +BM_UFlat/3 117 117 1.6GB/s jpg_200 +0.0% +BM_UFlat/4 29817 29648 3.0GB/s pdf +0.6% +BM_UFlat/5 297126 293073 1.3GB/s html4 +1.4% +BM_UFlat/6 28252 27994 842.0MB/s cp +0.9% +BM_UFlat/7 12672 12391 862.1MB/s c +2.3% +BM_UFlat/8 3507 3425 1040.9MB/s lsp +2.4% +BM_UFlat/9 1004268 969395 1018.0MB/s xls +3.6% +BM_UFlat/10 233 227 844.8MB/s xls_200 +2.6% +BM_UFlat/11 230054 224981 647.8MB/s txt1 +2.3% +BM_UFlat/12 
201229 196447 610.5MB/s txt2 +2.4% +BM_UFlat/13 609547 596761 685.3MB/s txt3 +2.1% +BM_UFlat/14 824362 804821 573.8MB/s txt4 +2.4% +BM_UFlat/15 371095 374899 1.3GB/s bin -1.0% +BM_UFlat/16 267 267 717.8MB/s bin_200 +0.0% +BM_UFlat/17 44623 43828 835.9MB/s sum +1.8% +BM_UFlat/18 5077 4815 841.0MB/s man +5.4% +BM_UFlat/19 74964 73210 1.5GB/s pb +2.4% +BM_UFlat/20 237987 236745 746.0MB/s gaviota +0.5% +Sum of all benchmarks 4794092 4697659 +2.1% + + +Istanbul: + +Benchmark Base (ns) New (ns) Improvement +------------------------------------------------------------------------------------------------- +BM_UFlat/0 98614 96376 1020.4MB/s html +2.3% +BM_UFlat/1 963740 953241 707.2MB/s urls +1.1% +BM_UFlat/2 25042 24769 4.8GB/s jpg +1.1% +BM_UFlat/3 180 180 1065.6MB/s jpg_200 +0.0% +BM_UFlat/4 45942 45403 1.9GB/s pdf +1.2% +BM_UFlat/5 400135 390226 1008.2MB/s html4 +2.5% +BM_UFlat/6 37768 37392 631.9MB/s cp +1.0% +BM_UFlat/7 18585 18200 588.2MB/s c +2.1% +BM_UFlat/8 5751 5690 627.7MB/s lsp +1.1% +BM_UFlat/9 1543154 1542209 641.4MB/s xls +0.1% +BM_UFlat/10 381 388 494.6MB/s xls_200 -1.8% +BM_UFlat/11 339715 331973 440.1MB/s txt1 +2.3% +BM_UFlat/12 294807 289418 415.4MB/s txt2 +1.9% +BM_UFlat/13 906160 884094 463.3MB/s txt3 +2.5% +BM_UFlat/14 1224221 1198435 386.1MB/s txt4 +2.2% +BM_UFlat/15 516277 502923 979.5MB/s bin +2.7% +BM_UFlat/16 405 402 477.2MB/s bin_200 +0.7% +BM_UFlat/17 61640 60621 605.6MB/s sum +1.7% +BM_UFlat/18 7326 7383 549.5MB/s man -0.8% +BM_UFlat/19 94720 92653 1.2GB/s pb +2.2% +BM_UFlat/20 360435 346687 510.6MB/s gaviota +4.0% +Sum of all benchmarks 6944998 6828663 +1.7% + +------------------------------------------------------------------------ +r76 | snappy.mirrorbot@gmail.com | 2013-06-13 18:19:52 +0200 (Thu, 13 Jun 2013) | 9 lines + +Add support for uncompressing to iovecs (scatter I/O). +Windows does not have struct iovec defined anywhere, +so we define our own version that's equal to what UNIX +typically has. 
+ +The bulk of this patch was contributed by Mohit Aron. + +R=jeff + +------------------------------------------------------------------------ +r75 | snappy.mirrorbot@gmail.com | 2013-06-12 21:51:15 +0200 (Wed, 12 Jun 2013) | 4 lines + +Some code reorganization needed for an internal change. + +R=fikes + +------------------------------------------------------------------------ +r74 | snappy.mirrorbot@gmail.com | 2013-04-09 17:33:30 +0200 (Tue, 09 Apr 2013) | 4 lines + +Supports truncated test data in zippy benchmark. + +R=sesse + +------------------------------------------------------------------------ +r73 | snappy.mirrorbot@gmail.com | 2013-02-05 15:36:15 +0100 (Tue, 05 Feb 2013) | 4 lines + +Release Snappy 1.1.0. + +R=sanjay + ------------------------------------------------------------------------ r72 | snappy.mirrorbot@gmail.com | 2013-02-05 15:30:05 +0100 (Tue, 05 Feb 2013) | 9 lines diff --git a/NEWS b/NEWS index ab9bf90..f21e9d0 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,16 @@ +Snappy v1.1.1, October 15th 2013: + + * Add support for uncompressing to iovecs (scatter I/O). + The bulk of this patch was contributed by Mohit Aron. + + * Speed up decompression by ~2%; much more so (~13-20%) on + a few benchmarks on given compilers and CPUs. + + * Fix a few issues with MSVC compilation. + + * Support truncated test data in the benchmark. + + Snappy v1.1.0, January 18th 2013: * Snappy now uses 64 kB block size instead of 32 kB. On average, diff --git a/configure.ac b/configure.ac index 9ee8938..bfd15b0 100644 --- a/configure.ac +++ b/configure.ac @@ -1,11 +1,11 @@ m4_define([snappy_major], [1]) m4_define([snappy_minor], [1]) -m4_define([snappy_patchlevel], [0]) +m4_define([snappy_patchlevel], [1]) # Libtool shared library interface versions (current:revision:age) # Update this value for every release! 
(A:B:C will map to foo.so.(A-C).C.B) # http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html -m4_define([snappy_ltversion], [2:4:1]) +m4_define([snappy_ltversion], [3:0:2]) AC_INIT([snappy], [snappy_major.snappy_minor.snappy_patchlevel]) AC_CONFIG_MACRO_DIR([m4])