From 136b3ebc3101a3459a251e4359efbf67fef246a0 Mon Sep 17 00:00:00 2001 From: atdt Date: Wed, 12 Dec 2018 07:14:02 -0800 Subject: [PATCH] If BMI2 instructions are available, use BZHI to extract low bytes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With --cpu=haswell, this results in a significant decompression speedup (notably 12-14% higher throughput for html and pb in BM_UFlat). One outlier: BM_UIOVec/2 [jpg] is about 18% slower on Haswell. On k8, performance is not affected (as expected). Full benchmark results for --cpu={k8,haswell} below. Haswell ------- name old time/op new time/op delta BM_UFlat/0 [html ] 55.2µs ± 0% 49.0µs ± 0% -11.34% (p=0.008 n=5+5) BM_UFlat/1 [urls ] 612µs ± 0% 604µs ± 0% -1.21% (p=0.008 n=5+5) BM_UFlat/2 [jpg ] 6.11µs ± 2% 6.07µs ± 1% ~ (p=0.421 n=5+5) BM_UFlat/3 [jpg_200 ] 134ns ± 0% 132ns ± 5% -1.49% (p=0.048 n=5+5) BM_UFlat/4 [pdf ] 8.41µs ± 2% 8.34µs ± 1% ~ (p=0.222 n=5+5) BM_UFlat/5 [html4 ] 239µs ± 0% 234µs ± 0% -2.24% (p=0.008 n=5+5) BM_UFlat/6 [txt1 ] 211µs ± 0% 205µs ± 0% -2.73% (p=0.008 n=5+5) BM_UFlat/7 [txt2 ] 185µs ± 0% 181µs ± 0% -2.34% (p=0.008 n=5+5) BM_UFlat/8 [txt3 ] 560µs ± 0% 545µs ± 0% -2.55% (p=0.008 n=5+5) BM_UFlat/9 [txt4 ] 773µs ± 0% 753µs ± 0% -2.61% (p=0.008 n=5+5) BM_UFlat/10 [pb ] 51.6µs ± 0% 45.3µs ± 0% -12.28% (p=0.008 n=5+5) BM_UFlat/11 [gaviota ] 209µs ± 0% 204µs ± 0% -2.28% (p=0.008 n=5+5) BM_UFlat/12 [cp ] 17.3µs ± 0% 15.7µs ± 1% -9.57% (p=0.008 n=5+5) BM_UFlat/13 [c ] 8.08µs ± 0% 8.00µs ± 0% -0.99% (p=0.008 n=5+5) BM_UFlat/14 [lsp ] 2.48µs ± 0% 2.45µs ± 0% -1.11% (p=0.008 n=5+5) BM_UFlat/15 [xls ] 967µs ± 0% 954µs ± 0% -1.36% (p=0.008 n=5+5) BM_UFlat/16 [xls_200 ] 219ns ± 1% 218ns ± 1% ~ (p=0.444 n=5+5) BM_UFlat/17 [bin ] 278µs ± 0% 275µs ± 0% -0.92% (p=0.008 n=5+5) BM_UFlat/18 [bin_200 ] 100ns ± 0% 99ns ± 1% -1.04% (p=0.008 n=5+5) BM_UFlat/19 [sum ] 34.0µs ± 0% 30.9µs ± 0% -9.10% (p=0.008 n=5+5) BM_UFlat/20 [man ] 3.21µs ± 0% 3.20µs ± 0% ~ (p=0.063 n=5+5) BM_UValidate/0 [html ] 33.1µs ± 0% 33.6µs ± 0% +1.69% (p=0.008 n=5+5) BM_UValidate/1 [urls ] 
436µs ± 0% 441µs ± 0% +1.06% (p=0.008 n=5+5) BM_UValidate/2 [jpg ] 141ns ± 0% 142ns ± 0% +0.71% (p=0.008 n=5+5) BM_UValidate/3 [jpg_200 ] 94.3ns ± 0% 95.3ns ± 0% +1.06% (p=0.008 n=5+5) BM_UValidate/4 [pdf ] 2.87µs ± 0% 2.95µs ± 0% +2.74% (p=0.008 n=5+5) BM_UIOVec/0 [html ] 126µs ± 0% 124µs ± 0% -1.50% (p=0.008 n=5+5) BM_UIOVec/1 [urls ] 1.13ms ± 0% 1.11ms ± 0% -1.95% (p=0.008 n=5+5) BM_UIOVec/2 [jpg ] 6.31µs ± 3% 7.44µs ± 3% +17.75% (p=0.008 n=5+5) BM_UIOVec/3 [jpg_200 ] 332ns ± 1% 318ns ± 1% -4.22% (p=0.008 n=5+5) BM_UIOVec/4 [pdf ] 12.7µs ± 3% 12.6µs ± 9% ~ (p=0.222 n=5+5) BM_UFlatSink/0 [html ] 55.2µs ± 0% 49.0µs ± 0% -11.31% (p=0.008 n=5+5) BM_UFlatSink/1 [urls ] 612µs ± 0% 605µs ± 0% -1.17% (p=0.008 n=5+5) BM_UFlatSink/2 [jpg ] 6.29µs ±12% 6.57µs ± 9% ~ (p=0.548 n=5+5) BM_UFlatSink/3 [jpg_200 ] 138ns ± 2% 134ns ± 0% -2.76% (p=0.000 n=5+4) BM_UFlatSink/4 [pdf ] 8.35µs ± 0% 8.34µs ± 1% ~ (p=0.905 n=4+5) BM_UFlatSink/5 [html4 ] 239µs ± 0% 234µs ± 0% -2.33% (p=0.008 n=5+5) BM_UFlatSink/6 [txt1 ] 211µs ± 0% 205µs ± 0% -2.82% (p=0.008 n=5+5) BM_UFlatSink/7 [txt2 ] 185µs ± 0% 181µs ± 0% -2.18% (p=0.008 n=5+5) BM_UFlatSink/8 [txt3 ] 560µs ± 0% 545µs ± 0% -2.57% (p=0.008 n=5+5) BM_UFlatSink/9 [txt4 ] 773µs ± 0% 754µs ± 0% -2.54% (p=0.008 n=5+5) BM_UFlatSink/10 [pb ] 51.6µs ± 0% 45.3µs ± 0% -12.19% (p=0.008 n=5+5) BM_UFlatSink/11 [gaviota ] 209µs ± 0% 204µs ± 0% -2.39% (p=0.008 n=5+5) BM_UFlatSink/12 [cp ] 17.3µs ± 0% 15.6µs ± 0% -9.98% (p=0.008 n=5+5) BM_UFlatSink/13 [c ] 8.10µs ± 1% 7.98µs ± 0% -1.53% (p=0.008 n=5+5) BM_UFlatSink/14 [lsp ] 2.49µs ± 1% 2.47µs ± 0% -0.84% (p=0.008 n=5+5) BM_UFlatSink/15 [xls ] 968µs ± 0% 953µs ± 0% -1.48% (p=0.008 n=5+5) BM_UFlatSink/16 [xls_200 ] 220ns ± 1% 220ns ± 0% ~ (p=1.000 n=5+4) BM_UFlatSink/17 [bin ] 278µs ± 0% 275µs ± 0% -0.99% (p=0.008 n=5+5) BM_UFlatSink/18 [bin_200 ] 102ns ± 1% 103ns ± 0% +1.18% (p=0.048 n=5+5) BM_UFlatSink/19 [sum ] 34.0µs ± 0% 30.9µs ± 0% -9.21% (p=0.008 n=5+5) BM_UFlatSink/20 [man ] 3.22µs ± 1% 3.20µs ± 
0% -0.76% (p=0.032 n=5+5) BM_ZFlat/0 [html (22.31 %) ] 122µs ± 0% 122µs ± 0% ~ (p=0.413 n=4+5) BM_ZFlat/1 [urls (47.78 %) ] 1.60ms ± 0% 1.60ms ± 0% -0.06% (p=0.032 n=5+5) BM_ZFlat/2 [jpg (99.95 %) ] 10.5µs ± 2% 10.7µs ± 9% ~ (p=0.841 n=5+5) BM_ZFlat/3 [jpg_200 (73.00 %)] 310ns ± 1% 309ns ± 3% ~ (p=0.349 n=4+5) BM_ZFlat/4 [pdf (83.30 %) ] 13.5µs ± 1% 13.6µs ± 2% ~ (p=0.595 n=5+5) BM_ZFlat/5 [html4 (22.52 %) ] 533µs ± 0% 532µs ± 0% -0.08% (p=0.032 n=5+5) BM_ZFlat/6 [txt1 (57.88 %) ] 529µs ± 0% 528µs ± 0% ~ (p=0.222 n=5+5) BM_ZFlat/7 [txt2 (61.91 %) ] 469µs ± 0% 469µs ± 0% ~ (p=0.690 n=5+5) BM_ZFlat/8 [txt3 (54.99 %) ] 1.40ms ± 0% 1.40ms ± 0% ~ (p=0.548 n=5+5) BM_ZFlat/9 [txt4 (66.26 %) ] 1.93ms ± 0% 1.92ms ± 0% ~ (p=0.421 n=5+5) BM_ZFlat/10 [pb (19.68 %) ] 106µs ± 0% 106µs ± 0% ~ (p=0.548 n=5+5) BM_ZFlat/11 [gaviota (37.72 %)] 404µs ± 0% 404µs ± 0% ~ (p=0.841 n=5+5) BM_ZFlat/12 [cp (48.12 %) ] 43.2µs ± 0% 43.3µs ± 1% ~ (p=0.151 n=5+5) BM_ZFlat/13 [c (42.47 %) ] 16.4µs ± 1% 16.4µs ± 0% ~ (p=0.310 n=5+5) BM_ZFlat/14 [lsp (48.37 %) ] 4.96µs ± 0% 4.96µs ± 1% ~ (p=0.651 n=5+5) BM_ZFlat/15 [xls (41.23 %) ] 1.54ms ± 0% 1.54ms ± 0% ~ (p=0.841 n=5+5) BM_ZFlat/16 [xls_200 (78.00 %)] 352ns ± 2% 351ns ± 1% ~ (p=0.762 n=5+5) BM_ZFlat/17 [bin (18.11 %) ] 491µs ± 0% 491µs ± 0% ~ (p=0.310 n=5+5) BM_ZFlat/18 [bin_200 (7.50 %) ] 75.6ns ± 1% 77.2ns ± 0% +2.06% (p=0.016 n=5+4) BM_ZFlat/19 [sum (48.96 %) ] 76.9µs ± 0% 76.7µs ± 0% ~ (p=0.222 n=5+5) BM_ZFlat/20 [man (59.21 %) ] 6.87µs ± 1% 6.81µs ± 0% -0.87% (p=0.008 n=5+5) name old speed new speed delta BM_UFlat/0 [html ] 1.85GB/s ± 0% 2.09GB/s ± 0% +12.83% (p=0.016 n=4+5) BM_UFlat/1 [urls ] 1.15GB/s ± 0% 1.16GB/s ± 0% +1.25% (p=0.008 n=5+5) BM_UFlat/2 [jpg ] 20.1GB/s ± 2% 20.3GB/s ± 1% ~ (p=0.421 n=5+5) BM_UFlat/3 [jpg_200 ] 1.49GB/s ± 0% 1.53GB/s ± 0% +2.83% (p=0.016 n=5+4) BM_UFlat/4 [pdf ] 12.2GB/s ± 2% 12.3GB/s ± 1% ~ (p=0.222 n=5+5) BM_UFlat/5 [html4 ] 1.71GB/s ± 0% 1.75GB/s ± 0% +2.29% (p=0.008 n=5+5) BM_UFlat/6 [txt1 ] 722MB/s ± 
0% 742MB/s ± 0% +2.81% (p=0.008 n=5+5) BM_UFlat/7 [txt2 ] 676MB/s ± 0% 692MB/s ± 0% +2.40% (p=0.008 n=5+5) BM_UFlat/8 [txt3 ] 762MB/s ± 0% 782MB/s ± 0% +2.62% (p=0.008 n=5+5) BM_UFlat/9 [txt4 ] 623MB/s ± 0% 640MB/s ± 0% +2.68% (p=0.008 n=5+5) BM_UFlat/10 [pb ] 2.30GB/s ± 0% 2.62GB/s ± 0% +13.99% (p=0.008 n=5+5) BM_UFlat/11 [gaviota ] 883MB/s ± 0% 903MB/s ± 0% +2.33% (p=0.008 n=5+5) BM_UFlat/12 [cp ] 1.42GB/s ± 0% 1.57GB/s ± 1% +10.57% (p=0.008 n=5+5) BM_UFlat/13 [c ] 1.38GB/s ± 0% 1.39GB/s ± 0% +1.00% (p=0.008 n=5+5) BM_UFlat/14 [lsp ] 1.50GB/s ± 0% 1.52GB/s ± 0% +1.12% (p=0.008 n=5+5) BM_UFlat/15 [xls ] 1.06GB/s ± 0% 1.08GB/s ± 0% +1.34% (p=0.016 n=5+4) BM_UFlat/16 [xls_200 ] 913MB/s ± 1% 918MB/s ± 1% ~ (p=0.421 n=5+5) BM_UFlat/17 [bin ] 1.85GB/s ± 0% 1.86GB/s ± 0% +0.92% (p=0.008 n=5+5) BM_UFlat/18 [bin_200 ] 2.01GB/s ± 0% 2.03GB/s ± 1% +1.10% (p=0.008 n=5+5) BM_UFlat/19 [sum ] 1.13GB/s ± 0% 1.24GB/s ± 0% +9.99% (p=0.008 n=5+5) BM_UFlat/20 [man ] 1.32GB/s ± 0% 1.32GB/s ± 1% ~ (p=0.063 n=5+5) BM_UValidate/0 [html ] 3.10GB/s ± 0% 3.04GB/s ± 0% -1.66% (p=0.008 n=5+5) BM_UValidate/1 [urls ] 1.61GB/s ± 0% 1.59GB/s ± 0% -1.04% (p=0.008 n=5+5) BM_UValidate/2 [jpg ] 875GB/s ± 0% 866GB/s ± 0% -1.11% (p=0.008 n=5+5) BM_UValidate/3 [jpg_200 ] 2.12GB/s ± 0% 2.10GB/s ± 0% -1.01% (p=0.016 n=5+4) BM_UValidate/4 [pdf ] 35.7GB/s ± 0% 34.7GB/s ± 0% -2.66% (p=0.008 n=5+5) BM_UIOVec/0 [html ] 813MB/s ± 0% 825MB/s ± 0% +1.52% (p=0.008 n=5+5) BM_UIOVec/1 [urls ] 622MB/s ± 0% 634MB/s ± 0% +1.99% (p=0.008 n=5+5) BM_UIOVec/2 [jpg ] 19.5GB/s ± 3% 16.6GB/s ± 3% -15.08% (p=0.008 n=5+5) BM_UIOVec/3 [jpg_200 ] 603MB/s ± 1% 630MB/s ± 1% +4.42% (p=0.008 n=5+5) BM_UIOVec/4 [pdf ] 8.05GB/s ± 3% 8.12GB/s ± 8% ~ (p=0.222 n=5+5) BM_UFlatSink/0 [html ] 1.85GB/s ± 0% 2.09GB/s ± 0% +12.76% (p=0.008 n=5+5) BM_UFlatSink/1 [urls ] 1.15GB/s ± 0% 1.16GB/s ± 0% +1.18% (p=0.008 n=5+5) BM_UFlatSink/2 [jpg ] 19.6GB/s ±11% 18.8GB/s ± 9% ~ (p=0.548 n=5+5) BM_UFlatSink/3 [jpg_200 ] 1.45GB/s ± 1% 1.49GB/s ± 0% 
+2.82% (p=0.016 n=5+4) BM_UFlatSink/4 [pdf ] 12.3GB/s ± 0% 12.3GB/s ± 1% ~ (p=0.905 n=4+5) BM_UFlatSink/5 [html4 ] 1.71GB/s ± 0% 1.75GB/s ± 0% +2.41% (p=0.008 n=5+5) BM_UFlatSink/6 [txt1 ] 722MB/s ± 0% 743MB/s ± 0% +2.90% (p=0.008 n=5+5) BM_UFlatSink/7 [txt2 ] 676MB/s ± 0% 691MB/s ± 0% +2.23% (p=0.008 n=5+5) BM_UFlatSink/8 [txt3 ] 763MB/s ± 0% 783MB/s ± 0% +2.64% (p=0.008 n=5+5) BM_UFlatSink/9 [txt4 ] 623MB/s ± 0% 639MB/s ± 0% +2.61% (p=0.008 n=5+5) BM_UFlatSink/10 [pb ] 2.30GB/s ± 0% 2.62GB/s ± 0% +13.86% (p=0.008 n=5+5) BM_UFlatSink/11 [gaviota ] 882MB/s ± 0% 904MB/s ± 0% +2.45% (p=0.008 n=5+5) BM_UFlatSink/12 [cp ] 1.42GB/s ± 0% 1.58GB/s ± 0% +11.09% (p=0.008 n=5+5) BM_UFlatSink/13 [c ] 1.38GB/s ± 1% 1.40GB/s ± 0% +1.56% (p=0.008 n=5+5) BM_UFlatSink/14 [lsp ] 1.50GB/s ± 1% 1.51GB/s ± 1% +0.85% (p=0.008 n=5+5) BM_UFlatSink/15 [xls ] 1.06GB/s ± 0% 1.08GB/s ± 0% +1.51% (p=0.016 n=5+4) BM_UFlatSink/16 [xls_200 ] 908MB/s ± 1% 911MB/s ± 0% ~ (p=0.730 n=5+4) BM_UFlatSink/17 [bin ] 1.85GB/s ± 0% 1.86GB/s ± 0% +1.01% (p=0.008 n=5+5) BM_UFlatSink/18 [bin_200 ] 1.96GB/s ± 1% 1.94GB/s ± 1% -1.18% (p=0.016 n=5+5) BM_UFlatSink/19 [sum ] 1.12GB/s ± 0% 1.24GB/s ± 0% +10.16% (p=0.008 n=5+5) BM_UFlatSink/20 [man ] 1.31GB/s ± 1% 1.32GB/s ± 0% +0.77% (p=0.048 n=5+5) BM_ZFlat/0 [html (22.31 %) ] 839MB/s ± 0% 839MB/s ± 0% ~ (p=0.413 n=4+5) BM_ZFlat/1 [urls (47.78 %) ] 439MB/s ± 0% 439MB/s ± 0% +0.06% (p=0.032 n=5+5) BM_ZFlat/2 [jpg (99.95 %) ] 11.7GB/s ± 2% 11.5GB/s ± 9% ~ (p=0.841 n=5+5) BM_ZFlat/3 [jpg_200 (73.00 %)] 645MB/s ± 1% 647MB/s ± 3% ~ (p=0.413 n=4+5) BM_ZFlat/4 [pdf (83.30 %) ] 7.57GB/s ± 1% 7.54GB/s ± 2% ~ (p=0.595 n=5+5) BM_ZFlat/5 [html4 (22.52 %) ] 769MB/s ± 0% 770MB/s ± 0% +0.08% (p=0.032 n=5+5) BM_ZFlat/6 [txt1 (57.88 %) ] 288MB/s ± 0% 288MB/s ± 0% ~ (p=0.222 n=5+5) BM_ZFlat/7 [txt2 (61.91 %) ] 267MB/s ± 0% 267MB/s ± 0% ~ (p=0.690 n=5+5) BM_ZFlat/8 [txt3 (54.99 %) ] 305MB/s ± 0% 305MB/s ± 0% ~ (p=0.548 n=5+5) BM_ZFlat/9 [txt4 (66.26 %) ] 250MB/s ± 0% 251MB/s ± 0% ~ 
(p=0.421 n=5+5) BM_ZFlat/10 [pb (19.68 %) ] 1.12GB/s ± 0% 1.12GB/s ± 0% ~ (p=0.635 n=5+5) BM_ZFlat/11 [gaviota (37.72 %)] 457MB/s ± 0% 457MB/s ± 0% ~ (p=0.841 n=5+5) BM_ZFlat/12 [cp (48.12 %) ] 570MB/s ± 0% 568MB/s ± 1% ~ (p=0.151 n=5+5) BM_ZFlat/13 [c (42.47 %) ] 682MB/s ± 1% 681MB/s ± 0% ~ (p=0.310 n=5+5) BM_ZFlat/14 [lsp (48.37 %) ] 750MB/s ± 0% 751MB/s ± 1% ~ (p=0.690 n=5+5) BM_ZFlat/15 [xls (41.23 %) ] 668MB/s ± 0% 668MB/s ± 0% ~ (p=0.841 n=5+5) BM_ZFlat/16 [xls_200 (78.00 %)] 569MB/s ± 2% 570MB/s ± 1% ~ (p=0.841 n=5+5) BM_ZFlat/17 [bin (18.11 %) ] 1.04GB/s ± 0% 1.04GB/s ± 0% ~ (p=0.310 n=5+5) BM_ZFlat/18 [bin_200 (7.50 %) ] 2.64GB/s ± 1% 2.59GB/s ± 0% -1.99% (p=0.016 n=5+4) BM_ZFlat/19 [sum (48.96 %) ] 497MB/s ± 0% 498MB/s ± 0% ~ (p=0.222 n=5+5) BM_ZFlat/20 [man (59.21 %) ] 615MB/s ± 1% 621MB/s ± 0% +0.87% (p=0.008 n=5+5) K8 -- name old time/op new time/op delta BM_UFlat/0 [html ] 41.7µs ± 0% 41.7µs ± 0% ~ (p=0.841 n=5+5) BM_UFlat/1 [urls ] 588µs ± 0% 588µs ± 0% ~ (p=0.310 n=5+5) BM_UFlat/2 [jpg ] 7.11µs ± 1% 7.10µs ± 1% ~ (p=0.556 n=5+4) BM_UFlat/3 [jpg_200 ] 130ns ± 0% 130ns ± 0% ~ (all samples are equal) BM_UFlat/4 [pdf ] 8.19µs ± 0% 8.26µs ± 2% ~ (p=0.460 n=5+5) BM_UFlat/5 [html4 ] 219µs ± 0% 219µs ± 0% ~ (p=1.000 n=5+5) BM_UFlat/6 [txt1 ] 192µs ± 0% 191µs ± 0% ~ (p=0.341 n=5+5) BM_UFlat/7 [txt2 ] 170µs ± 0% 170µs ± 0% ~ (p=0.841 n=5+5) BM_UFlat/8 [txt3 ] 509µs ± 0% 509µs ± 0% ~ (p=0.151 n=5+5) BM_UFlat/9 [txt4 ] 712µs ± 0% 712µs ± 0% ~ (p=0.841 n=5+5) BM_UFlat/10 [pb ] 38.5µs ± 0% 38.5µs ± 0% ~ (p=0.452 n=5+5) BM_UFlat/11 [gaviota ] 189µs ± 0% 189µs ± 0% ~ (p=0.841 n=5+5) BM_UFlat/12 [cp ] 14.2µs ± 1% 14.2µs ± 0% ~ (p=0.889 n=5+5) BM_UFlat/13 [c ] 7.32µs ± 0% 7.33µs ± 0% ~ (p=1.000 n=5+5) BM_UFlat/14 [lsp ] 2.26µs ± 0% 2.27µs ± 0% ~ (p=0.222 n=4+5) BM_UFlat/15 [xls ] 954µs ± 0% 955µs ± 0% ~ (p=0.222 n=5+5) BM_UFlat/16 [xls_200 ] 215ns ± 4% 212ns ± 0% ~ (p=0.095 n=5+4) BM_UFlat/17 [bin ] 276µs ± 0% 276µs ± 0% ~ (p=0.841 n=5+5) BM_UFlat/18 [bin_200 ] 104ns 
±10% 103ns ± 3% ~ (p=0.825 n=5+5) BM_UFlat/19 [sum ] 29.2µs ± 0% 29.2µs ± 0% ~ (p=0.690 n=5+5) BM_UFlat/20 [man ] 2.96µs ± 0% 2.97µs ± 0% +0.43% (p=0.032 n=5+5) BM_UValidate/0 [html ] 33.4µs ± 0% 33.4µs ± 0% ~ (p=0.151 n=5+5) BM_UValidate/1 [urls ] 441µs ± 0% 441µs ± 0% ~ (p=0.548 n=5+5) BM_UValidate/2 [jpg ] 146ns ± 0% 146ns ± 0% ~ (all samples are equal) BM_UValidate/3 [jpg_200 ] 98.0ns ± 0% 98.0ns ± 0% ~ (p=1.000 n=5+5) BM_UValidate/4 [pdf ] 2.89µs ± 0% 2.89µs ± 0% ~ (p=0.794 n=5+5) BM_UIOVec/0 [html ] 121µs ± 0% 121µs ± 0% ~ (p=0.151 n=5+5) BM_UIOVec/1 [urls ] 1.08ms ± 0% 1.08ms ± 0% ~ (p=0.095 n=5+5) BM_UIOVec/2 [jpg ] 7.47µs ± 5% 7.31µs ± 2% ~ (p=0.222 n=5+5) BM_UIOVec/3 [jpg_200 ] 330ns ± 0% 330ns ± 0% ~ (all samples are equal) BM_UIOVec/4 [pdf ] 12.3µs ± 2% 12.0µs ± 0% ~ (p=0.063 n=5+5) BM_UFlatSink/0 [html ] 41.6µs ± 0% 41.6µs ± 0% ~ (p=0.095 n=5+5) BM_UFlatSink/1 [urls ] 589µs ± 0% 589µs ± 0% ~ (p=1.000 n=5+5) BM_UFlatSink/2 [jpg ] 7.84µs ±26% 7.23µs ± 5% ~ (p=0.690 n=5+5) BM_UFlatSink/3 [jpg_200 ] 132ns ± 0% 132ns ± 0% ~ (all samples are equal) BM_UFlatSink/4 [pdf ] 8.43µs ± 3% 8.27µs ± 2% ~ (p=0.254 n=5+5) BM_UFlatSink/5 [html4 ] 219µs ± 0% 219µs ± 0% ~ (p=0.524 n=5+5) BM_UFlatSink/6 [txt1 ] 192µs ± 0% 192µs ± 0% ~ (p=0.690 n=5+5) BM_UFlatSink/7 [txt2 ] 170µs ± 0% 170µs ± 0% ~ (p=0.421 n=5+5) BM_UFlatSink/8 [txt3 ] 509µs ± 0% 509µs ± 0% ~ (p=0.310 n=5+5) BM_UFlatSink/9 [txt4 ] 712µs ± 0% 712µs ± 0% ~ (p=0.841 n=5+5) BM_UFlatSink/10 [pb ] 38.5µs ± 0% 38.5µs ± 0% ~ (p=0.421 n=5+5) BM_UFlatSink/11 [gaviota ] 189µs ± 0% 189µs ± 0% ~ (p=1.000 n=5+5) BM_UFlatSink/12 [cp ] 14.2µs ± 0% 14.2µs ± 0% ~ (p=0.421 n=5+5) BM_UFlatSink/13 [c ] 7.37µs ± 1% 7.36µs ± 1% ~ (p=0.746 n=5+5) BM_UFlatSink/14 [lsp ] 2.27µs ± 0% 2.27µs ± 1% ~ (p=0.714 n=5+5) BM_UFlatSink/15 [xls ] 954µs ± 0% 954µs ± 0% ~ (p=1.000 n=5+5) BM_UFlatSink/16 [xls_200 ] 215ns ± 1% 215ns ± 1% ~ (p=0.921 n=5+5) BM_UFlatSink/17 [bin ] 276µs ± 0% 276µs ± 0% ~ (p=1.000 n=5+5) BM_UFlatSink/18 [bin_200 ] 
103ns ± 2% 104ns ± 1% ~ (p=0.429 n=5+5) BM_UFlatSink/19 [sum ] 29.2µs ± 0% 29.2µs ± 0% ~ (p=0.452 n=5+5) BM_UFlatSink/20 [man ] 2.96µs ± 0% 2.97µs ± 1% ~ (p=0.484 n=5+5) BM_ZFlat/0 [html (22.31 %) ] 126µs ± 0% 126µs ± 0% ~ (p=1.000 n=5+5) BM_ZFlat/1 [urls (47.78 %) ] 1.67ms ± 0% 1.67ms ± 0% ~ (p=0.841 n=5+5) BM_ZFlat/2 [jpg (99.95 %) ] 11.6µs ± 4% 11.6µs ± 3% ~ (p=1.000 n=5+5) BM_ZFlat/3 [jpg_200 (73.00 %)] 368ns ± 1% 367ns ± 0% ~ (p=0.159 n=5+5) BM_ZFlat/4 [pdf (83.30 %) ] 14.7µs ± 1% 14.6µs ± 0% ~ (p=0.190 n=5+4) BM_ZFlat/5 [html4 (22.52 %) ] 550µs ± 0% 550µs ± 0% ~ (p=0.841 n=5+5) BM_ZFlat/6 [txt1 (57.88 %) ] 540µs ± 0% 540µs ± 0% ~ (p=0.310 n=5+5) BM_ZFlat/7 [txt2 (61.91 %) ] 479µs ± 0% 480µs ± 0% ~ (p=1.000 n=5+5) BM_ZFlat/8 [txt3 (54.99 %) ] 1.44ms ± 0% 1.44ms ± 0% ~ (p=0.421 n=5+5) BM_ZFlat/9 [txt4 (66.26 %) ] 1.97ms ± 0% 1.97ms ± 0% ~ (p=0.421 n=5+5) BM_ZFlat/10 [pb (19.68 %) ] 110µs ± 0% 109µs ± 0% ~ (p=0.730 n=5+4) BM_ZFlat/11 [gaviota (37.72 %)] 412µs ± 0% 412µs ± 0% ~ (p=1.000 n=5+5) BM_ZFlat/12 [cp (48.12 %) ] 46.3µs ± 0% 46.3µs ± 1% ~ (p=0.841 n=5+5) BM_ZFlat/13 [c (42.47 %) ] 17.7µs ± 0% 17.7µs ± 1% ~ (p=0.841 n=5+5) BM_ZFlat/14 [lsp (48.37 %) ] 5.54µs ± 1% 5.55µs ± 0% ~ (p=0.254 n=5+4) BM_ZFlat/15 [xls (41.23 %) ] 1.62ms ± 0% 1.63ms ± 0% ~ (p=0.151 n=5+5) BM_ZFlat/16 [xls_200 (78.00 %)] 395ns ± 2% 394ns ± 1% ~ (p=1.000 n=5+5) BM_ZFlat/17 [bin (18.11 %) ] 507µs ± 0% 507µs ± 0% ~ (p=0.056 n=5+5) BM_ZFlat/18 [bin_200 (7.50 %) ] 89.6ns ± 5% 89.8ns ± 5% ~ (p=1.000 n=5+5) BM_ZFlat/19 [sum (48.96 %) ] 79.9µs ± 0% 79.9µs ± 0% ~ (p=0.690 n=5+5) BM_ZFlat/20 [man (59.21 %) ] 7.67µs ± 0% 7.67µs ± 1% ~ (p=0.548 n=5+5) name old speed new speed delta BM_UFlat/0 [html ] 2.45GB/s ± 0% 2.45GB/s ± 0% ~ (p=0.889 n=5+5) BM_UFlat/1 [urls ] 1.19GB/s ± 0% 1.19GB/s ± 0% ~ (all samples are equal) BM_UFlat/2 [jpg ] 17.3GB/s ± 1% 17.3GB/s ± 1% ~ (p=0.556 n=5+4) BM_UFlat/3 [jpg_200 ] 1.54GB/s ± 0% 1.54GB/s ± 0% ~ (p=0.833 n=5+5) BM_UFlat/4 [pdf ] 12.5GB/s ± 0% 12.4GB/s ± 2% ~ 
(p=0.421 n=5+5) BM_UFlat/5 [html4 ] 1.87GB/s ± 0% 1.87GB/s ± 0% ~ (p=1.000 n=4+5) BM_UFlat/6 [txt1 ] 794MB/s ± 0% 794MB/s ± 0% ~ (p=0.310 n=5+5) BM_UFlat/7 [txt2 ] 738MB/s ± 0% 738MB/s ± 0% ~ (p=0.841 n=5+5) BM_UFlat/8 [txt3 ] 839MB/s ± 0% 838MB/s ± 0% ~ (p=0.151 n=5+5) BM_UFlat/9 [txt4 ] 677MB/s ± 0% 677MB/s ± 0% ~ (p=0.841 n=5+5) BM_UFlat/10 [pb ] 3.08GB/s ± 0% 3.08GB/s ± 0% ~ (p=0.452 n=5+5) BM_UFlat/11 [gaviota ] 975MB/s ± 0% 975MB/s ± 0% ~ (p=0.841 n=5+5) BM_UFlat/12 [cp ] 1.73GB/s ± 1% 1.73GB/s ± 0% ~ (p=0.984 n=5+5) BM_UFlat/13 [c ] 1.52GB/s ± 0% 1.52GB/s ± 0% ~ (p=0.841 n=5+5) BM_UFlat/14 [lsp ] 1.64GB/s ± 0% 1.64GB/s ± 0% ~ (p=0.254 n=4+5) BM_UFlat/15 [xls ] 1.08GB/s ± 0% 1.08GB/s ± 0% ~ (p=0.095 n=5+4) BM_UFlat/16 [xls_200 ] 931MB/s ± 4% 941MB/s ± 0% ~ (p=0.151 n=5+5) BM_UFlat/17 [bin ] 1.86GB/s ± 0% 1.86GB/s ± 0% ~ (p=0.762 n=5+5) BM_UFlat/18 [bin_200 ] 1.92GB/s ± 9% 1.95GB/s ± 3% ~ (p=1.000 n=5+5) BM_UFlat/19 [sum ] 1.31GB/s ± 1% 1.31GB/s ± 0% ~ (p=0.548 n=5+5) BM_UFlat/20 [man ] 1.43GB/s ± 0% 1.42GB/s ± 1% -0.42% (p=0.040 n=5+5) BM_UValidate/0 [html ] 3.06GB/s ± 0% 3.06GB/s ± 0% ~ (p=0.151 n=5+5) BM_UValidate/1 [urls ] 1.59GB/s ± 0% 1.59GB/s ± 0% ~ (p=0.357 n=5+5) BM_UValidate/2 [jpg ] 845GB/s ± 0% 845GB/s ± 0% ~ (p=0.548 n=5+5) BM_UValidate/3 [jpg_200 ] 2.04GB/s ± 0% 2.04GB/s ± 0% ~ (p=1.000 n=5+5) BM_UValidate/4 [pdf ] 35.4GB/s ± 0% 35.4GB/s ± 0% ~ (p=0.421 n=5+5) BM_UIOVec/0 [html ] 845MB/s ± 0% 845MB/s ± 0% ~ (p=0.151 n=5+5) BM_UIOVec/1 [urls ] 650MB/s ± 0% 650MB/s ± 0% ~ (p=0.087 n=5+5) BM_UIOVec/2 [jpg ] 16.5GB/s ± 5% 16.8GB/s ± 2% ~ (p=0.222 n=5+5) BM_UIOVec/3 [jpg_200 ] 605MB/s ± 0% 605MB/s ± 0% ~ (p=0.690 n=5+5) BM_UIOVec/4 [pdf ] 8.36GB/s ± 2% 8.54GB/s ± 0% ~ (p=0.063 n=5+5) BM_UFlatSink/0 [html ] 2.46GB/s ± 0% 2.46GB/s ± 0% ~ (p=0.063 n=5+5) BM_UFlatSink/1 [urls ] 1.19GB/s ± 0% 1.19GB/s ± 0% ~ (all samples are equal) BM_UFlatSink/2 [jpg ] 16.0GB/s ±22% 17.0GB/s ± 5% ~ (p=0.690 n=5+5) BM_UFlatSink/3 [jpg_200 ] 1.51GB/s ± 0% 1.51GB/s ± 2% ~ 
(p=1.000 n=5+5) BM_UFlatSink/4 [pdf ] 12.2GB/s ± 3% 12.4GB/s ± 2% ~ (p=0.254 n=5+5) BM_UFlatSink/5 [html4 ] 1.87GB/s ± 0% 1.87GB/s ± 0% ~ (p=0.532 n=5+5) BM_UFlatSink/6 [txt1 ] 794MB/s ± 0% 794MB/s ± 0% ~ (p=0.690 n=5+5) BM_UFlatSink/7 [txt2 ] 738MB/s ± 0% 738MB/s ± 0% ~ (p=0.421 n=5+5) BM_UFlatSink/8 [txt3 ] 838MB/s ± 0% 838MB/s ± 0% ~ (p=0.310 n=5+5) BM_UFlatSink/9 [txt4 ] 676MB/s ± 0% 676MB/s ± 0% ~ (p=0.841 n=5+5) BM_UFlatSink/10 [pb ] 3.08GB/s ± 0% 3.08GB/s ± 0% ~ (p=0.365 n=5+5) BM_UFlatSink/11 [gaviota ] 975MB/s ± 0% 975MB/s ± 0% ~ (p=1.000 n=5+5) BM_UFlatSink/12 [cp ] 1.73GB/s ± 0% 1.74GB/s ± 0% ~ (p=0.286 n=5+5) BM_UFlatSink/13 [c ] 1.51GB/s ± 1% 1.52GB/s ± 1% ~ (p=0.683 n=5+5) BM_UFlatSink/14 [lsp ] 1.64GB/s ± 0% 1.64GB/s ± 0% ~ (p=0.444 n=5+5) BM_UFlatSink/15 [xls ] 1.08GB/s ± 0% 1.08GB/s ± 0% ~ (p=0.333 n=4+5) BM_UFlatSink/16 [xls_200 ] 930MB/s ± 1% 930MB/s ± 1% ~ (p=0.841 n=5+5) BM_UFlatSink/17 [bin ] 1.86GB/s ± 0% 1.86GB/s ± 0% ~ (p=1.000 n=5+5) BM_UFlatSink/18 [bin_200 ] 1.93GB/s ± 2% 1.93GB/s ± 1% ~ (p=0.651 n=5+5) BM_UFlatSink/19 [sum ] 1.31GB/s ± 0% 1.31GB/s ± 0% ~ (p=0.508 n=5+5) BM_UFlatSink/20 [man ] 1.43GB/s ± 0% 1.42GB/s ± 1% ~ (p=0.524 n=5+5) BM_ZFlat/0 [html (22.31 %) ] 815MB/s ± 0% 815MB/s ± 0% ~ (p=1.000 n=5+5) BM_ZFlat/1 [urls (47.78 %) ] 420MB/s ± 0% 420MB/s ± 0% ~ (p=0.841 n=5+5) BM_ZFlat/2 [jpg (99.95 %) ] 10.6GB/s ± 4% 10.6GB/s ± 3% ~ (p=1.000 n=5+5) BM_ZFlat/3 [jpg_200 (73.00 %)] 543MB/s ± 1% 546MB/s ± 0% ~ (p=0.095 n=5+5) BM_ZFlat/4 [pdf (83.30 %) ] 6.96GB/s ± 1% 7.01GB/s ± 0% ~ (p=0.190 n=5+4) BM_ZFlat/5 [html4 (22.52 %) ] 745MB/s ± 0% 745MB/s ± 0% ~ (p=0.841 n=5+5) BM_ZFlat/6 [txt1 (57.88 %) ] 282MB/s ± 0% 282MB/s ± 0% ~ (p=0.310 n=5+5) BM_ZFlat/7 [txt2 (61.91 %) ] 261MB/s ± 0% 261MB/s ± 0% ~ (p=1.000 n=5+5) BM_ZFlat/8 [txt3 (54.99 %) ] 297MB/s ± 0% 297MB/s ± 0% ~ (p=0.421 n=5+5) BM_ZFlat/9 [txt4 (66.26 %) ] 244MB/s ± 0% 244MB/s ± 0% ~ (p=0.389 n=5+5) BM_ZFlat/10 [pb (19.68 %) ] 1.08GB/s ± 0% 1.08GB/s ± 0% ~ (p=0.238 n=5+4) 
BM_ZFlat/11 [gaviota (37.72 %)] 448MB/s ± 0% 447MB/s ± 0% ~ (p=1.000 n=5+5) BM_ZFlat/12 [cp (48.12 %) ] 532MB/s ± 0% 531MB/s ± 1% ~ (p=0.841 n=5+5) BM_ZFlat/13 [c (42.47 %) ] 632MB/s ± 0% 631MB/s ± 1% ~ (p=0.841 n=5+5) BM_ZFlat/14 [lsp (48.37 %) ] 672MB/s ± 1% 671MB/s ± 0% ~ (p=0.286 n=5+4) BM_ZFlat/15 [xls (41.23 %) ] 634MB/s ± 0% 633MB/s ± 0% ~ (p=0.151 n=5+5) BM_ZFlat/16 [xls_200 (78.00 %)] 507MB/s ± 2% 508MB/s ± 1% ~ (p=1.000 n=5+5) BM_ZFlat/17 [bin (18.11 %) ] 1.01GB/s ± 0% 1.01GB/s ± 0% ~ (p=0.056 n=5+5) BM_ZFlat/18 [bin_200 (7.50 %) ] 2.24GB/s ± 5% 2.23GB/s ± 5% ~ (p=0.889 n=5+5) BM_ZFlat/19 [sum (48.96 %) ] 479MB/s ± 0% 479MB/s ± 0% ~ (p=0.690 n=5+5) BM_ZFlat/20 [man (59.21 %) ] 551MB/s ± 0% 551MB/s ± 1% ~ (p=0.548 n=5+5) --- snappy.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/snappy.cc b/snappy.cc index c8ccad8..26bef54 100644 --- a/snappy.cc +++ b/snappy.cc @@ -42,9 +42,7 @@ #endif // !defined(SNAPPY_HAVE_SSSE3) #if SNAPPY_HAVE_SSSE3 -// Please do not replace with . or with headers that assume more -// advanced SSE versions without checking with all the OWNERS. -#include +#include #endif #include @@ -694,14 +692,17 @@ static inline void Report(const char *algorithm, size_t compressed_size, // bool TryFastAppend(const char* ip, size_t available, size_t length); // }; -// Mapping from n in range [0,4] to a mask to extract the bottom 8*n bits. -static inline uint32 WordMask(int n) { +static inline uint32 ExtractLowBytes(uint32 v, int n) { DCHECK_GE(n, 0); DCHECK_LE(n, 4); +#ifdef __BMI2__ + return _bzhi_u32(v, 8 * n); +#else // This needs to be wider than uint32 otherwise `mask << 32` will be // undefined. uint64 mask = 0xffffffff; - return ~(mask << (8 * n)); + return v & ~(mask << (8 * n)); +#endif } // Helper class for decompression @@ -831,7 +832,8 @@ class SnappyDecompressor { // Long literal. 
const size_t literal_length_length = literal_length - 60; literal_length = - (LittleEndian::Load32(ip) & WordMask(literal_length_length)) + 1; + ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) + + 1; ip += literal_length_length; } @@ -854,7 +856,8 @@ class SnappyDecompressor { MAYBE_REFILL(); } else { const size_t entry = char_table[c]; - const size_t trailer = LittleEndian::Load32(ip) & WordMask(entry >> 11); + const size_t trailer = + ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11); const size_t length = entry & 0xff; ip += entry >> 11;