If BMI2 instructions are available, use BZHI to extract low bytes.

With --cpu=haswell, this results in a significant decompression speedup
(notably 12-14% higher throughput for html and pb). On k8, where BMI2 is not
enabled and the mask-based fallback is still compiled, performance is not
affected (as expected). Full benchmark results for --cpu={k8,haswell} below.

Haswell
-------

name                                          old time/op             new time/op             delta
BM_UFlat/0      [html             ]            55.2µs ± 0%             49.0µs ± 0%  -11.34%          (p=0.008 n=5+5)
BM_UFlat/1      [urls             ]             612µs ± 0%              604µs ± 0%   -1.21%          (p=0.008 n=5+5)
BM_UFlat/2      [jpg              ]            6.11µs ± 2%             6.07µs ± 1%     ~             (p=0.421 n=5+5)
BM_UFlat/3      [jpg_200          ]              134ns ± 0%              132ns ± 5%   -1.49%          (p=0.048 n=5+5)
BM_UFlat/4      [pdf              ]            8.41µs ± 2%             8.34µs ± 1%     ~             (p=0.222 n=5+5)
BM_UFlat/5      [html4            ]             239µs ± 0%              234µs ± 0%   -2.24%          (p=0.008 n=5+5)
BM_UFlat/6      [txt1             ]             211µs ± 0%              205µs ± 0%   -2.73%          (p=0.008 n=5+5)
BM_UFlat/7      [txt2             ]             185µs ± 0%              181µs ± 0%   -2.34%          (p=0.008 n=5+5)
BM_UFlat/8      [txt3             ]             560µs ± 0%              545µs ± 0%   -2.55%          (p=0.008 n=5+5)
BM_UFlat/9      [txt4             ]             773µs ± 0%              753µs ± 0%   -2.61%          (p=0.008 n=5+5)
BM_UFlat/10     [pb               ]            51.6µs ± 0%             45.3µs ± 0%  -12.28%          (p=0.008 n=5+5)
BM_UFlat/11     [gaviota          ]             209µs ± 0%              204µs ± 0%   -2.28%          (p=0.008 n=5+5)
BM_UFlat/12     [cp               ]            17.3µs ± 0%             15.7µs ± 1%   -9.57%          (p=0.008 n=5+5)
BM_UFlat/13     [c                ]            8.08µs ± 0%             8.00µs ± 0%   -0.99%          (p=0.008 n=5+5)
BM_UFlat/14     [lsp              ]            2.48µs ± 0%             2.45µs ± 0%   -1.11%          (p=0.008 n=5+5)
BM_UFlat/15     [xls              ]             967µs ± 0%              954µs ± 0%   -1.36%          (p=0.008 n=5+5)
BM_UFlat/16     [xls_200          ]              219ns ± 1%              218ns ± 1%     ~             (p=0.444 n=5+5)
BM_UFlat/17     [bin              ]             278µs ± 0%              275µs ± 0%   -0.92%          (p=0.008 n=5+5)
BM_UFlat/18     [bin_200          ]              100ns ± 0%               99ns ± 1%   -1.04%          (p=0.008 n=5+5)
BM_UFlat/19     [sum              ]            34.0µs ± 0%             30.9µs ± 0%   -9.10%          (p=0.008 n=5+5)
BM_UFlat/20     [man              ]            3.21µs ± 0%             3.20µs ± 0%     ~             (p=0.063 n=5+5)
BM_UValidate/0  [html             ]            33.1µs ± 0%             33.6µs ± 0%   +1.69%          (p=0.008 n=5+5)
BM_UValidate/1  [urls             ]             436µs ± 0%              441µs ± 0%   +1.06%          (p=0.008 n=5+5)
BM_UValidate/2  [jpg              ]              141ns ± 0%              142ns ± 0%   +0.71%          (p=0.008 n=5+5)
BM_UValidate/3  [jpg_200          ]             94.3ns ± 0%             95.3ns ± 0%   +1.06%          (p=0.008 n=5+5)
BM_UValidate/4  [pdf              ]            2.87µs ± 0%             2.95µs ± 0%   +2.74%          (p=0.008 n=5+5)
BM_UIOVec/0     [html             ]             126µs ± 0%              124µs ± 0%   -1.50%          (p=0.008 n=5+5)
BM_UIOVec/1     [urls             ]             1.13ms ± 0%             1.11ms ± 0%   -1.95%          (p=0.008 n=5+5)
BM_UIOVec/2     [jpg              ]            6.31µs ± 3%             7.44µs ± 3%  +17.75%          (p=0.008 n=5+5)
BM_UIOVec/3     [jpg_200          ]              332ns ± 1%              318ns ± 1%   -4.22%          (p=0.008 n=5+5)
BM_UIOVec/4     [pdf              ]            12.7µs ± 3%             12.6µs ± 9%     ~             (p=0.222 n=5+5)
BM_UFlatSink/0  [html             ]            55.2µs ± 0%             49.0µs ± 0%  -11.31%          (p=0.008 n=5+5)
BM_UFlatSink/1  [urls             ]             612µs ± 0%              605µs ± 0%   -1.17%          (p=0.008 n=5+5)
BM_UFlatSink/2  [jpg              ]            6.29µs ±12%             6.57µs ± 9%     ~             (p=0.548 n=5+5)
BM_UFlatSink/3  [jpg_200          ]              138ns ± 2%              134ns ± 0%   -2.76%          (p=0.000 n=5+4)
BM_UFlatSink/4  [pdf              ]            8.35µs ± 0%             8.34µs ± 1%     ~             (p=0.905 n=4+5)
BM_UFlatSink/5  [html4            ]             239µs ± 0%              234µs ± 0%   -2.33%          (p=0.008 n=5+5)
BM_UFlatSink/6  [txt1             ]             211µs ± 0%              205µs ± 0%   -2.82%          (p=0.008 n=5+5)
BM_UFlatSink/7  [txt2             ]             185µs ± 0%              181µs ± 0%   -2.18%          (p=0.008 n=5+5)
BM_UFlatSink/8  [txt3             ]             560µs ± 0%              545µs ± 0%   -2.57%          (p=0.008 n=5+5)
BM_UFlatSink/9  [txt4             ]             773µs ± 0%              754µs ± 0%   -2.54%          (p=0.008 n=5+5)
BM_UFlatSink/10 [pb               ]            51.6µs ± 0%             45.3µs ± 0%  -12.19%          (p=0.008 n=5+5)
BM_UFlatSink/11 [gaviota          ]             209µs ± 0%              204µs ± 0%   -2.39%          (p=0.008 n=5+5)
BM_UFlatSink/12 [cp               ]            17.3µs ± 0%             15.6µs ± 0%   -9.98%          (p=0.008 n=5+5)
BM_UFlatSink/13 [c                ]            8.10µs ± 1%             7.98µs ± 0%   -1.53%          (p=0.008 n=5+5)
BM_UFlatSink/14 [lsp              ]            2.49µs ± 1%             2.47µs ± 0%   -0.84%          (p=0.008 n=5+5)
BM_UFlatSink/15 [xls              ]             968µs ± 0%              953µs ± 0%   -1.48%          (p=0.008 n=5+5)
BM_UFlatSink/16 [xls_200          ]              220ns ± 1%              220ns ± 0%     ~             (p=1.000 n=5+4)
BM_UFlatSink/17 [bin              ]             278µs ± 0%              275µs ± 0%   -0.99%          (p=0.008 n=5+5)
BM_UFlatSink/18 [bin_200          ]              102ns ± 1%              103ns ± 0%   +1.18%          (p=0.048 n=5+5)
BM_UFlatSink/19 [sum              ]            34.0µs ± 0%             30.9µs ± 0%   -9.21%          (p=0.008 n=5+5)
BM_UFlatSink/20 [man              ]            3.22µs ± 1%             3.20µs ± 0%   -0.76%          (p=0.032 n=5+5)
BM_ZFlat/0      [html (22.31 %)   ]             122µs ± 0%              122µs ± 0%     ~             (p=0.413 n=4+5)
BM_ZFlat/1      [urls (47.78 %)   ]             1.60ms ± 0%             1.60ms ± 0%   -0.06%          (p=0.032 n=5+5)
BM_ZFlat/2      [jpg (99.95 %)    ]            10.5µs ± 2%             10.7µs ± 9%     ~             (p=0.841 n=5+5)
BM_ZFlat/3      [jpg_200 (73.00 %)]              310ns ± 1%              309ns ± 3%     ~             (p=0.349 n=4+5)
BM_ZFlat/4      [pdf (83.30 %)    ]            13.5µs ± 1%             13.6µs ± 2%     ~             (p=0.595 n=5+5)
BM_ZFlat/5      [html4 (22.52 %)  ]             533µs ± 0%              532µs ± 0%   -0.08%          (p=0.032 n=5+5)
BM_ZFlat/6      [txt1 (57.88 %)   ]             529µs ± 0%              528µs ± 0%     ~             (p=0.222 n=5+5)
BM_ZFlat/7      [txt2 (61.91 %)   ]             469µs ± 0%              469µs ± 0%     ~             (p=0.690 n=5+5)
BM_ZFlat/8      [txt3 (54.99 %)   ]             1.40ms ± 0%             1.40ms ± 0%     ~             (p=0.548 n=5+5)
BM_ZFlat/9      [txt4 (66.26 %)   ]             1.93ms ± 0%             1.92ms ± 0%     ~             (p=0.421 n=5+5)
BM_ZFlat/10     [pb (19.68 %)     ]             106µs ± 0%              106µs ± 0%     ~             (p=0.548 n=5+5)
BM_ZFlat/11     [gaviota (37.72 %)]             404µs ± 0%              404µs ± 0%     ~             (p=0.841 n=5+5)
BM_ZFlat/12     [cp (48.12 %)     ]            43.2µs ± 0%             43.3µs ± 1%     ~             (p=0.151 n=5+5)
BM_ZFlat/13     [c (42.47 %)      ]            16.4µs ± 1%             16.4µs ± 0%     ~             (p=0.310 n=5+5)
BM_ZFlat/14     [lsp (48.37 %)    ]            4.96µs ± 0%             4.96µs ± 1%     ~             (p=0.651 n=5+5)
BM_ZFlat/15     [xls (41.23 %)    ]             1.54ms ± 0%             1.54ms ± 0%     ~             (p=0.841 n=5+5)
BM_ZFlat/16     [xls_200 (78.00 %)]              352ns ± 2%              351ns ± 1%     ~             (p=0.762 n=5+5)
BM_ZFlat/17     [bin (18.11 %)    ]             491µs ± 0%              491µs ± 0%     ~             (p=0.310 n=5+5)
BM_ZFlat/18     [bin_200 (7.50 %) ]             75.6ns ± 1%             77.2ns ± 0%   +2.06%          (p=0.016 n=5+4)
BM_ZFlat/19     [sum (48.96 %)    ]            76.9µs ± 0%             76.7µs ± 0%     ~             (p=0.222 n=5+5)
BM_ZFlat/20     [man (59.21 %)    ]            6.87µs ± 1%             6.81µs ± 0%   -0.87%          (p=0.008 n=5+5)

name                                          old speed               new speed               delta
BM_UFlat/0      [html             ]           1.85GB/s ± 0%           2.09GB/s ± 0%  +12.83%          (p=0.016 n=4+5)
BM_UFlat/1      [urls             ]           1.15GB/s ± 0%           1.16GB/s ± 0%   +1.25%          (p=0.008 n=5+5)
BM_UFlat/2      [jpg              ]           20.1GB/s ± 2%           20.3GB/s ± 1%     ~             (p=0.421 n=5+5)
BM_UFlat/3      [jpg_200          ]           1.49GB/s ± 0%           1.53GB/s ± 0%   +2.83%          (p=0.016 n=5+4)
BM_UFlat/4      [pdf              ]           12.2GB/s ± 2%           12.3GB/s ± 1%     ~             (p=0.222 n=5+5)
BM_UFlat/5      [html4            ]           1.71GB/s ± 0%           1.75GB/s ± 0%   +2.29%          (p=0.008 n=5+5)
BM_UFlat/6      [txt1             ]            722MB/s ± 0%            742MB/s ± 0%   +2.81%          (p=0.008 n=5+5)
BM_UFlat/7      [txt2             ]            676MB/s ± 0%            692MB/s ± 0%   +2.40%          (p=0.008 n=5+5)
BM_UFlat/8      [txt3             ]            762MB/s ± 0%            782MB/s ± 0%   +2.62%          (p=0.008 n=5+5)
BM_UFlat/9      [txt4             ]            623MB/s ± 0%            640MB/s ± 0%   +2.68%          (p=0.008 n=5+5)
BM_UFlat/10     [pb               ]           2.30GB/s ± 0%           2.62GB/s ± 0%  +13.99%          (p=0.008 n=5+5)
BM_UFlat/11     [gaviota          ]            883MB/s ± 0%            903MB/s ± 0%   +2.33%          (p=0.008 n=5+5)
BM_UFlat/12     [cp               ]           1.42GB/s ± 0%           1.57GB/s ± 1%  +10.57%          (p=0.008 n=5+5)
BM_UFlat/13     [c                ]           1.38GB/s ± 0%           1.39GB/s ± 0%   +1.00%          (p=0.008 n=5+5)
BM_UFlat/14     [lsp              ]           1.50GB/s ± 0%           1.52GB/s ± 0%   +1.12%          (p=0.008 n=5+5)
BM_UFlat/15     [xls              ]           1.06GB/s ± 0%           1.08GB/s ± 0%   +1.34%          (p=0.016 n=5+4)
BM_UFlat/16     [xls_200          ]            913MB/s ± 1%            918MB/s ± 1%     ~             (p=0.421 n=5+5)
BM_UFlat/17     [bin              ]           1.85GB/s ± 0%           1.86GB/s ± 0%   +0.92%          (p=0.008 n=5+5)
BM_UFlat/18     [bin_200          ]           2.01GB/s ± 0%           2.03GB/s ± 1%   +1.10%          (p=0.008 n=5+5)
BM_UFlat/19     [sum              ]           1.13GB/s ± 0%           1.24GB/s ± 0%   +9.99%          (p=0.008 n=5+5)
BM_UFlat/20     [man              ]           1.32GB/s ± 0%           1.32GB/s ± 1%     ~             (p=0.063 n=5+5)
BM_UValidate/0  [html             ]           3.10GB/s ± 0%           3.04GB/s ± 0%   -1.66%          (p=0.008 n=5+5)
BM_UValidate/1  [urls             ]           1.61GB/s ± 0%           1.59GB/s ± 0%   -1.04%          (p=0.008 n=5+5)
BM_UValidate/2  [jpg              ]            875GB/s ± 0%            866GB/s ± 0%   -1.11%          (p=0.008 n=5+5)
BM_UValidate/3  [jpg_200          ]           2.12GB/s ± 0%           2.10GB/s ± 0%   -1.01%          (p=0.016 n=5+4)
BM_UValidate/4  [pdf              ]           35.7GB/s ± 0%           34.7GB/s ± 0%   -2.66%          (p=0.008 n=5+5)
BM_UIOVec/0     [html             ]            813MB/s ± 0%            825MB/s ± 0%   +1.52%          (p=0.008 n=5+5)
BM_UIOVec/1     [urls             ]            622MB/s ± 0%            634MB/s ± 0%   +1.99%          (p=0.008 n=5+5)
BM_UIOVec/2     [jpg              ]           19.5GB/s ± 3%           16.6GB/s ± 3%  -15.08%          (p=0.008 n=5+5)
BM_UIOVec/3     [jpg_200          ]            603MB/s ± 1%            630MB/s ± 1%   +4.42%          (p=0.008 n=5+5)
BM_UIOVec/4     [pdf              ]           8.05GB/s ± 3%           8.12GB/s ± 8%     ~             (p=0.222 n=5+5)
BM_UFlatSink/0  [html             ]           1.85GB/s ± 0%           2.09GB/s ± 0%  +12.76%          (p=0.008 n=5+5)
BM_UFlatSink/1  [urls             ]           1.15GB/s ± 0%           1.16GB/s ± 0%   +1.18%          (p=0.008 n=5+5)
BM_UFlatSink/2  [jpg              ]           19.6GB/s ±11%           18.8GB/s ± 9%     ~             (p=0.548 n=5+5)
BM_UFlatSink/3  [jpg_200          ]           1.45GB/s ± 1%           1.49GB/s ± 0%   +2.82%          (p=0.016 n=5+4)
BM_UFlatSink/4  [pdf              ]           12.3GB/s ± 0%           12.3GB/s ± 1%     ~             (p=0.905 n=4+5)
BM_UFlatSink/5  [html4            ]           1.71GB/s ± 0%           1.75GB/s ± 0%   +2.41%          (p=0.008 n=5+5)
BM_UFlatSink/6  [txt1             ]            722MB/s ± 0%            743MB/s ± 0%   +2.90%          (p=0.008 n=5+5)
BM_UFlatSink/7  [txt2             ]            676MB/s ± 0%            691MB/s ± 0%   +2.23%          (p=0.008 n=5+5)
BM_UFlatSink/8  [txt3             ]            763MB/s ± 0%            783MB/s ± 0%   +2.64%          (p=0.008 n=5+5)
BM_UFlatSink/9  [txt4             ]            623MB/s ± 0%            639MB/s ± 0%   +2.61%          (p=0.008 n=5+5)
BM_UFlatSink/10 [pb               ]           2.30GB/s ± 0%           2.62GB/s ± 0%  +13.86%          (p=0.008 n=5+5)
BM_UFlatSink/11 [gaviota          ]            882MB/s ± 0%            904MB/s ± 0%   +2.45%          (p=0.008 n=5+5)
BM_UFlatSink/12 [cp               ]           1.42GB/s ± 0%           1.58GB/s ± 0%  +11.09%          (p=0.008 n=5+5)
BM_UFlatSink/13 [c                ]           1.38GB/s ± 1%           1.40GB/s ± 0%   +1.56%          (p=0.008 n=5+5)
BM_UFlatSink/14 [lsp              ]           1.50GB/s ± 1%           1.51GB/s ± 1%   +0.85%          (p=0.008 n=5+5)
BM_UFlatSink/15 [xls              ]           1.06GB/s ± 0%           1.08GB/s ± 0%   +1.51%          (p=0.016 n=5+4)
BM_UFlatSink/16 [xls_200          ]            908MB/s ± 1%            911MB/s ± 0%     ~             (p=0.730 n=5+4)
BM_UFlatSink/17 [bin              ]           1.85GB/s ± 0%           1.86GB/s ± 0%   +1.01%          (p=0.008 n=5+5)
BM_UFlatSink/18 [bin_200          ]           1.96GB/s ± 1%           1.94GB/s ± 1%   -1.18%          (p=0.016 n=5+5)
BM_UFlatSink/19 [sum              ]           1.12GB/s ± 0%           1.24GB/s ± 0%  +10.16%          (p=0.008 n=5+5)
BM_UFlatSink/20 [man              ]           1.31GB/s ± 1%           1.32GB/s ± 0%   +0.77%          (p=0.048 n=5+5)
BM_ZFlat/0      [html (22.31 %)   ]            839MB/s ± 0%            839MB/s ± 0%     ~             (p=0.413 n=4+5)
BM_ZFlat/1      [urls (47.78 %)   ]            439MB/s ± 0%            439MB/s ± 0%   +0.06%          (p=0.032 n=5+5)
BM_ZFlat/2      [jpg (99.95 %)    ]           11.7GB/s ± 2%           11.5GB/s ± 9%     ~             (p=0.841 n=5+5)
BM_ZFlat/3      [jpg_200 (73.00 %)]            645MB/s ± 1%            647MB/s ± 3%     ~             (p=0.413 n=4+5)
BM_ZFlat/4      [pdf (83.30 %)    ]           7.57GB/s ± 1%           7.54GB/s ± 2%     ~             (p=0.595 n=5+5)
BM_ZFlat/5      [html4 (22.52 %)  ]            769MB/s ± 0%            770MB/s ± 0%   +0.08%          (p=0.032 n=5+5)
BM_ZFlat/6      [txt1 (57.88 %)   ]            288MB/s ± 0%            288MB/s ± 0%     ~             (p=0.222 n=5+5)
BM_ZFlat/7      [txt2 (61.91 %)   ]            267MB/s ± 0%            267MB/s ± 0%     ~             (p=0.690 n=5+5)
BM_ZFlat/8      [txt3 (54.99 %)   ]            305MB/s ± 0%            305MB/s ± 0%     ~             (p=0.548 n=5+5)
BM_ZFlat/9      [txt4 (66.26 %)   ]            250MB/s ± 0%            251MB/s ± 0%     ~             (p=0.421 n=5+5)
BM_ZFlat/10     [pb (19.68 %)     ]           1.12GB/s ± 0%           1.12GB/s ± 0%     ~             (p=0.635 n=5+5)
BM_ZFlat/11     [gaviota (37.72 %)]            457MB/s ± 0%            457MB/s ± 0%     ~             (p=0.841 n=5+5)
BM_ZFlat/12     [cp (48.12 %)     ]            570MB/s ± 0%            568MB/s ± 1%     ~             (p=0.151 n=5+5)
BM_ZFlat/13     [c (42.47 %)      ]            682MB/s ± 1%            681MB/s ± 0%     ~             (p=0.310 n=5+5)
BM_ZFlat/14     [lsp (48.37 %)    ]            750MB/s ± 0%            751MB/s ± 1%     ~             (p=0.690 n=5+5)
BM_ZFlat/15     [xls (41.23 %)    ]            668MB/s ± 0%            668MB/s ± 0%     ~             (p=0.841 n=5+5)
BM_ZFlat/16     [xls_200 (78.00 %)]            569MB/s ± 2%            570MB/s ± 1%     ~             (p=0.841 n=5+5)
BM_ZFlat/17     [bin (18.11 %)    ]           1.04GB/s ± 0%           1.04GB/s ± 0%     ~             (p=0.310 n=5+5)
BM_ZFlat/18     [bin_200 (7.50 %) ]           2.64GB/s ± 1%           2.59GB/s ± 0%   -1.99%          (p=0.016 n=5+4)
BM_ZFlat/19     [sum (48.96 %)    ]            497MB/s ± 0%            498MB/s ± 0%     ~             (p=0.222 n=5+5)
BM_ZFlat/20     [man (59.21 %)    ]            615MB/s ± 1%            621MB/s ± 0%   +0.87%          (p=0.008 n=5+5)

K8
--

name                                          old time/op             new time/op             delta
BM_UFlat/0      [html             ]            41.7µs ± 0%             41.7µs ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/1      [urls             ]             588µs ± 0%              588µs ± 0%    ~             (p=0.310 n=5+5)
BM_UFlat/2      [jpg              ]            7.11µs ± 1%             7.10µs ± 1%    ~             (p=0.556 n=5+4)
BM_UFlat/3      [jpg_200          ]              130ns ± 0%              130ns ± 0%    ~     (all samples are equal)
BM_UFlat/4      [pdf              ]            8.19µs ± 0%             8.26µs ± 2%    ~             (p=0.460 n=5+5)
BM_UFlat/5      [html4            ]             219µs ± 0%              219µs ± 0%    ~             (p=1.000 n=5+5)
BM_UFlat/6      [txt1             ]             192µs ± 0%              191µs ± 0%    ~             (p=0.341 n=5+5)
BM_UFlat/7      [txt2             ]             170µs ± 0%              170µs ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/8      [txt3             ]             509µs ± 0%              509µs ± 0%    ~             (p=0.151 n=5+5)
BM_UFlat/9      [txt4             ]             712µs ± 0%              712µs ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/10     [pb               ]            38.5µs ± 0%             38.5µs ± 0%    ~             (p=0.452 n=5+5)
BM_UFlat/11     [gaviota          ]             189µs ± 0%              189µs ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/12     [cp               ]            14.2µs ± 1%             14.2µs ± 0%    ~             (p=0.889 n=5+5)
BM_UFlat/13     [c                ]            7.32µs ± 0%             7.33µs ± 0%    ~             (p=1.000 n=5+5)
BM_UFlat/14     [lsp              ]            2.26µs ± 0%             2.27µs ± 0%    ~             (p=0.222 n=4+5)
BM_UFlat/15     [xls              ]             954µs ± 0%              955µs ± 0%    ~             (p=0.222 n=5+5)
BM_UFlat/16     [xls_200          ]              215ns ± 4%              212ns ± 0%    ~             (p=0.095 n=5+4)
BM_UFlat/17     [bin              ]             276µs ± 0%              276µs ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/18     [bin_200          ]              104ns ±10%              103ns ± 3%    ~             (p=0.825 n=5+5)
BM_UFlat/19     [sum              ]            29.2µs ± 0%             29.2µs ± 0%    ~             (p=0.690 n=5+5)
BM_UFlat/20     [man              ]            2.96µs ± 0%             2.97µs ± 0%  +0.43%          (p=0.032 n=5+5)
BM_UValidate/0  [html             ]            33.4µs ± 0%             33.4µs ± 0%    ~             (p=0.151 n=5+5)
BM_UValidate/1  [urls             ]             441µs ± 0%              441µs ± 0%    ~             (p=0.548 n=5+5)
BM_UValidate/2  [jpg              ]              146ns ± 0%              146ns ± 0%    ~     (all samples are equal)
BM_UValidate/3  [jpg_200          ]             98.0ns ± 0%             98.0ns ± 0%    ~             (p=1.000 n=5+5)
BM_UValidate/4  [pdf              ]            2.89µs ± 0%             2.89µs ± 0%    ~             (p=0.794 n=5+5)
BM_UIOVec/0     [html             ]             121µs ± 0%              121µs ± 0%    ~             (p=0.151 n=5+5)
BM_UIOVec/1     [urls             ]             1.08ms ± 0%             1.08ms ± 0%    ~             (p=0.095 n=5+5)
BM_UIOVec/2     [jpg              ]            7.47µs ± 5%             7.31µs ± 2%    ~             (p=0.222 n=5+5)
BM_UIOVec/3     [jpg_200          ]              330ns ± 0%              330ns ± 0%    ~     (all samples are equal)
BM_UIOVec/4     [pdf              ]            12.3µs ± 2%             12.0µs ± 0%    ~             (p=0.063 n=5+5)
BM_UFlatSink/0  [html             ]            41.6µs ± 0%             41.6µs ± 0%    ~             (p=0.095 n=5+5)
BM_UFlatSink/1  [urls             ]             589µs ± 0%              589µs ± 0%    ~             (p=1.000 n=5+5)
BM_UFlatSink/2  [jpg              ]            7.84µs ±26%             7.23µs ± 5%    ~             (p=0.690 n=5+5)
BM_UFlatSink/3  [jpg_200          ]              132ns ± 0%              132ns ± 0%    ~     (all samples are equal)
BM_UFlatSink/4  [pdf              ]            8.43µs ± 3%             8.27µs ± 2%    ~             (p=0.254 n=5+5)
BM_UFlatSink/5  [html4            ]             219µs ± 0%              219µs ± 0%    ~             (p=0.524 n=5+5)
BM_UFlatSink/6  [txt1             ]             192µs ± 0%              192µs ± 0%    ~             (p=0.690 n=5+5)
BM_UFlatSink/7  [txt2             ]             170µs ± 0%              170µs ± 0%    ~             (p=0.421 n=5+5)
BM_UFlatSink/8  [txt3             ]             509µs ± 0%              509µs ± 0%    ~             (p=0.310 n=5+5)
BM_UFlatSink/9  [txt4             ]             712µs ± 0%              712µs ± 0%    ~             (p=0.841 n=5+5)
BM_UFlatSink/10 [pb               ]            38.5µs ± 0%             38.5µs ± 0%    ~             (p=0.421 n=5+5)
BM_UFlatSink/11 [gaviota          ]             189µs ± 0%              189µs ± 0%    ~             (p=1.000 n=5+5)
BM_UFlatSink/12 [cp               ]            14.2µs ± 0%             14.2µs ± 0%    ~             (p=0.421 n=5+5)
BM_UFlatSink/13 [c                ]            7.37µs ± 1%             7.36µs ± 1%    ~             (p=0.746 n=5+5)
BM_UFlatSink/14 [lsp              ]            2.27µs ± 0%             2.27µs ± 1%    ~             (p=0.714 n=5+5)
BM_UFlatSink/15 [xls              ]             954µs ± 0%              954µs ± 0%    ~             (p=1.000 n=5+5)
BM_UFlatSink/16 [xls_200          ]              215ns ± 1%              215ns ± 1%    ~             (p=0.921 n=5+5)
BM_UFlatSink/17 [bin              ]             276µs ± 0%              276µs ± 0%    ~             (p=1.000 n=5+5)
BM_UFlatSink/18 [bin_200          ]              103ns ± 2%              104ns ± 1%    ~             (p=0.429 n=5+5)
BM_UFlatSink/19 [sum              ]            29.2µs ± 0%             29.2µs ± 0%    ~             (p=0.452 n=5+5)
BM_UFlatSink/20 [man              ]            2.96µs ± 0%             2.97µs ± 1%    ~             (p=0.484 n=5+5)
BM_ZFlat/0      [html (22.31 %)   ]             126µs ± 0%              126µs ± 0%    ~             (p=1.000 n=5+5)
BM_ZFlat/1      [urls (47.78 %)   ]             1.67ms ± 0%             1.67ms ± 0%    ~             (p=0.841 n=5+5)
BM_ZFlat/2      [jpg (99.95 %)    ]            11.6µs ± 4%             11.6µs ± 3%    ~             (p=1.000 n=5+5)
BM_ZFlat/3      [jpg_200 (73.00 %)]              368ns ± 1%              367ns ± 0%    ~             (p=0.159 n=5+5)
BM_ZFlat/4      [pdf (83.30 %)    ]            14.7µs ± 1%             14.6µs ± 0%    ~             (p=0.190 n=5+4)
BM_ZFlat/5      [html4 (22.52 %)  ]             550µs ± 0%              550µs ± 0%    ~             (p=0.841 n=5+5)
BM_ZFlat/6      [txt1 (57.88 %)   ]             540µs ± 0%              540µs ± 0%    ~             (p=0.310 n=5+5)
BM_ZFlat/7      [txt2 (61.91 %)   ]             479µs ± 0%              480µs ± 0%    ~             (p=1.000 n=5+5)
BM_ZFlat/8      [txt3 (54.99 %)   ]             1.44ms ± 0%             1.44ms ± 0%    ~             (p=0.421 n=5+5)
BM_ZFlat/9      [txt4 (66.26 %)   ]             1.97ms ± 0%             1.97ms ± 0%    ~             (p=0.421 n=5+5)
BM_ZFlat/10     [pb (19.68 %)     ]             110µs ± 0%              109µs ± 0%    ~             (p=0.730 n=5+4)
BM_ZFlat/11     [gaviota (37.72 %)]             412µs ± 0%              412µs ± 0%    ~             (p=1.000 n=5+5)
BM_ZFlat/12     [cp (48.12 %)     ]            46.3µs ± 0%             46.3µs ± 1%    ~             (p=0.841 n=5+5)
BM_ZFlat/13     [c (42.47 %)      ]            17.7µs ± 0%             17.7µs ± 1%    ~             (p=0.841 n=5+5)
BM_ZFlat/14     [lsp (48.37 %)    ]            5.54µs ± 1%             5.55µs ± 0%    ~             (p=0.254 n=5+4)
BM_ZFlat/15     [xls (41.23 %)    ]             1.62ms ± 0%             1.63ms ± 0%    ~             (p=0.151 n=5+5)
BM_ZFlat/16     [xls_200 (78.00 %)]              395ns ± 2%              394ns ± 1%    ~             (p=1.000 n=5+5)
BM_ZFlat/17     [bin (18.11 %)    ]             507µs ± 0%              507µs ± 0%    ~             (p=0.056 n=5+5)
BM_ZFlat/18     [bin_200 (7.50 %) ]             89.6ns ± 5%             89.8ns ± 5%    ~             (p=1.000 n=5+5)
BM_ZFlat/19     [sum (48.96 %)    ]            79.9µs ± 0%             79.9µs ± 0%    ~             (p=0.690 n=5+5)
BM_ZFlat/20     [man (59.21 %)    ]            7.67µs ± 0%             7.67µs ± 1%    ~             (p=0.548 n=5+5)

name                                          old speed               new speed               delta
BM_UFlat/0      [html             ]           2.45GB/s ± 0%           2.45GB/s ± 0%    ~             (p=0.889 n=5+5)
BM_UFlat/1      [urls             ]           1.19GB/s ± 0%           1.19GB/s ± 0%    ~     (all samples are equal)
BM_UFlat/2      [jpg              ]           17.3GB/s ± 1%           17.3GB/s ± 1%    ~             (p=0.556 n=5+4)
BM_UFlat/3      [jpg_200          ]           1.54GB/s ± 0%           1.54GB/s ± 0%    ~             (p=0.833 n=5+5)
BM_UFlat/4      [pdf              ]           12.5GB/s ± 0%           12.4GB/s ± 2%    ~             (p=0.421 n=5+5)
BM_UFlat/5      [html4            ]           1.87GB/s ± 0%           1.87GB/s ± 0%    ~             (p=1.000 n=4+5)
BM_UFlat/6      [txt1             ]            794MB/s ± 0%            794MB/s ± 0%    ~             (p=0.310 n=5+5)
BM_UFlat/7      [txt2             ]            738MB/s ± 0%            738MB/s ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/8      [txt3             ]            839MB/s ± 0%            838MB/s ± 0%    ~             (p=0.151 n=5+5)
BM_UFlat/9      [txt4             ]            677MB/s ± 0%            677MB/s ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/10     [pb               ]           3.08GB/s ± 0%           3.08GB/s ± 0%    ~             (p=0.452 n=5+5)
BM_UFlat/11     [gaviota          ]            975MB/s ± 0%            975MB/s ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/12     [cp               ]           1.73GB/s ± 1%           1.73GB/s ± 0%    ~             (p=0.984 n=5+5)
BM_UFlat/13     [c                ]           1.52GB/s ± 0%           1.52GB/s ± 0%    ~             (p=0.841 n=5+5)
BM_UFlat/14     [lsp              ]           1.64GB/s ± 0%           1.64GB/s ± 0%    ~             (p=0.254 n=4+5)
BM_UFlat/15     [xls              ]           1.08GB/s ± 0%           1.08GB/s ± 0%    ~             (p=0.095 n=5+4)
BM_UFlat/16     [xls_200          ]            931MB/s ± 4%            941MB/s ± 0%    ~             (p=0.151 n=5+5)
BM_UFlat/17     [bin              ]           1.86GB/s ± 0%           1.86GB/s ± 0%    ~             (p=0.762 n=5+5)
BM_UFlat/18     [bin_200          ]           1.92GB/s ± 9%           1.95GB/s ± 3%    ~             (p=1.000 n=5+5)
BM_UFlat/19     [sum              ]           1.31GB/s ± 1%           1.31GB/s ± 0%    ~             (p=0.548 n=5+5)
BM_UFlat/20     [man              ]           1.43GB/s ± 0%           1.42GB/s ± 1%  -0.42%          (p=0.040 n=5+5)
BM_UValidate/0  [html             ]           3.06GB/s ± 0%           3.06GB/s ± 0%    ~             (p=0.151 n=5+5)
BM_UValidate/1  [urls             ]           1.59GB/s ± 0%           1.59GB/s ± 0%    ~             (p=0.357 n=5+5)
BM_UValidate/2  [jpg              ]            845GB/s ± 0%            845GB/s ± 0%    ~             (p=0.548 n=5+5)
BM_UValidate/3  [jpg_200          ]           2.04GB/s ± 0%           2.04GB/s ± 0%    ~             (p=1.000 n=5+5)
BM_UValidate/4  [pdf              ]           35.4GB/s ± 0%           35.4GB/s ± 0%    ~             (p=0.421 n=5+5)
BM_UIOVec/0     [html             ]            845MB/s ± 0%            845MB/s ± 0%    ~             (p=0.151 n=5+5)
BM_UIOVec/1     [urls             ]            650MB/s ± 0%            650MB/s ± 0%    ~             (p=0.087 n=5+5)
BM_UIOVec/2     [jpg              ]           16.5GB/s ± 5%           16.8GB/s ± 2%    ~             (p=0.222 n=5+5)
BM_UIOVec/3     [jpg_200          ]            605MB/s ± 0%            605MB/s ± 0%    ~             (p=0.690 n=5+5)
BM_UIOVec/4     [pdf              ]           8.36GB/s ± 2%           8.54GB/s ± 0%    ~             (p=0.063 n=5+5)
BM_UFlatSink/0  [html             ]           2.46GB/s ± 0%           2.46GB/s ± 0%    ~             (p=0.063 n=5+5)
BM_UFlatSink/1  [urls             ]           1.19GB/s ± 0%           1.19GB/s ± 0%    ~     (all samples are equal)
BM_UFlatSink/2  [jpg              ]           16.0GB/s ±22%           17.0GB/s ± 5%    ~             (p=0.690 n=5+5)
BM_UFlatSink/3  [jpg_200          ]           1.51GB/s ± 0%           1.51GB/s ± 2%    ~             (p=1.000 n=5+5)
BM_UFlatSink/4  [pdf              ]           12.2GB/s ± 3%           12.4GB/s ± 2%    ~             (p=0.254 n=5+5)
BM_UFlatSink/5  [html4            ]           1.87GB/s ± 0%           1.87GB/s ± 0%    ~             (p=0.532 n=5+5)
BM_UFlatSink/6  [txt1             ]            794MB/s ± 0%            794MB/s ± 0%    ~             (p=0.690 n=5+5)
BM_UFlatSink/7  [txt2             ]            738MB/s ± 0%            738MB/s ± 0%    ~             (p=0.421 n=5+5)
BM_UFlatSink/8  [txt3             ]            838MB/s ± 0%            838MB/s ± 0%    ~             (p=0.310 n=5+5)
BM_UFlatSink/9  [txt4             ]            676MB/s ± 0%            676MB/s ± 0%    ~             (p=0.841 n=5+5)
BM_UFlatSink/10 [pb               ]           3.08GB/s ± 0%           3.08GB/s ± 0%    ~             (p=0.365 n=5+5)
BM_UFlatSink/11 [gaviota          ]            975MB/s ± 0%            975MB/s ± 0%    ~             (p=1.000 n=5+5)
BM_UFlatSink/12 [cp               ]           1.73GB/s ± 0%           1.74GB/s ± 0%    ~             (p=0.286 n=5+5)
BM_UFlatSink/13 [c                ]           1.51GB/s ± 1%           1.52GB/s ± 1%    ~             (p=0.683 n=5+5)
BM_UFlatSink/14 [lsp              ]           1.64GB/s ± 0%           1.64GB/s ± 0%    ~             (p=0.444 n=5+5)
BM_UFlatSink/15 [xls              ]           1.08GB/s ± 0%           1.08GB/s ± 0%    ~             (p=0.333 n=4+5)
BM_UFlatSink/16 [xls_200          ]            930MB/s ± 1%            930MB/s ± 1%    ~             (p=0.841 n=5+5)
BM_UFlatSink/17 [bin              ]           1.86GB/s ± 0%           1.86GB/s ± 0%    ~             (p=1.000 n=5+5)
BM_UFlatSink/18 [bin_200          ]           1.93GB/s ± 2%           1.93GB/s ± 1%    ~             (p=0.651 n=5+5)
BM_UFlatSink/19 [sum              ]           1.31GB/s ± 0%           1.31GB/s ± 0%    ~             (p=0.508 n=5+5)
BM_UFlatSink/20 [man              ]           1.43GB/s ± 0%           1.42GB/s ± 1%    ~             (p=0.524 n=5+5)
BM_ZFlat/0      [html (22.31 %)   ]            815MB/s ± 0%            815MB/s ± 0%    ~             (p=1.000 n=5+5)
BM_ZFlat/1      [urls (47.78 %)   ]            420MB/s ± 0%            420MB/s ± 0%    ~             (p=0.841 n=5+5)
BM_ZFlat/2      [jpg (99.95 %)    ]           10.6GB/s ± 4%           10.6GB/s ± 3%    ~             (p=1.000 n=5+5)
BM_ZFlat/3      [jpg_200 (73.00 %)]            543MB/s ± 1%            546MB/s ± 0%    ~             (p=0.095 n=5+5)
BM_ZFlat/4      [pdf (83.30 %)    ]           6.96GB/s ± 1%           7.01GB/s ± 0%    ~             (p=0.190 n=5+4)
BM_ZFlat/5      [html4 (22.52 %)  ]            745MB/s ± 0%            745MB/s ± 0%    ~             (p=0.841 n=5+5)
BM_ZFlat/6      [txt1 (57.88 %)   ]            282MB/s ± 0%            282MB/s ± 0%    ~             (p=0.310 n=5+5)
BM_ZFlat/7      [txt2 (61.91 %)   ]            261MB/s ± 0%            261MB/s ± 0%    ~             (p=1.000 n=5+5)
BM_ZFlat/8      [txt3 (54.99 %)   ]            297MB/s ± 0%            297MB/s ± 0%    ~             (p=0.421 n=5+5)
BM_ZFlat/9      [txt4 (66.26 %)   ]            244MB/s ± 0%            244MB/s ± 0%    ~             (p=0.389 n=5+5)
BM_ZFlat/10     [pb (19.68 %)     ]           1.08GB/s ± 0%           1.08GB/s ± 0%    ~             (p=0.238 n=5+4)
BM_ZFlat/11     [gaviota (37.72 %)]            448MB/s ± 0%            447MB/s ± 0%    ~             (p=1.000 n=5+5)
BM_ZFlat/12     [cp (48.12 %)     ]            532MB/s ± 0%            531MB/s ± 1%    ~             (p=0.841 n=5+5)
BM_ZFlat/13     [c (42.47 %)      ]            632MB/s ± 0%            631MB/s ± 1%    ~             (p=0.841 n=5+5)
BM_ZFlat/14     [lsp (48.37 %)    ]            672MB/s ± 1%            671MB/s ± 0%    ~             (p=0.286 n=5+4)
BM_ZFlat/15     [xls (41.23 %)    ]            634MB/s ± 0%            633MB/s ± 0%    ~             (p=0.151 n=5+5)
BM_ZFlat/16     [xls_200 (78.00 %)]            507MB/s ± 2%            508MB/s ± 1%    ~             (p=1.000 n=5+5)
BM_ZFlat/17     [bin (18.11 %)    ]           1.01GB/s ± 0%           1.01GB/s ± 0%    ~             (p=0.056 n=5+5)
BM_ZFlat/18     [bin_200 (7.50 %) ]           2.24GB/s ± 5%           2.23GB/s ± 5%    ~             (p=0.889 n=5+5)
BM_ZFlat/19     [sum (48.96 %)    ]            479MB/s ± 0%            479MB/s ± 0%    ~             (p=0.690 n=5+5)
BM_ZFlat/20     [man (59.21 %)    ]            551MB/s ± 0%            551MB/s ± 1%    ~             (p=0.548 n=5+5)
This commit is contained in:
atdt 2018-12-12 07:14:02 -08:00 committed by Victor Costan
parent eb47f79631
commit 136b3ebc31
1 changed file with 11 additions and 8 deletions

View File

@@ -42,9 +42,7 @@
 #endif // !defined(SNAPPY_HAVE_SSSE3)
 
 #if SNAPPY_HAVE_SSSE3
-// Please do not replace with <x86intrin.h>. or with headers that assume more
-// advanced SSE versions without checking with all the OWNERS.
-#include <tmmintrin.h>
+#include <x86intrin.h>
 #endif
 
 #include <stdio.h>
@@ -694,14 +692,17 @@ static inline void Report(const char *algorithm, size_t compressed_size,
 // bool TryFastAppend(const char* ip, size_t available, size_t length);
 // };
 
-// Mapping from n in range [0,4] to a mask to extract the bottom 8*n bits.
-static inline uint32 WordMask(int n) {
+static inline uint32 ExtractLowBytes(uint32 v, int n) {
   DCHECK_GE(n, 0);
   DCHECK_LE(n, 4);
+#ifdef __BMI2__
+  return _bzhi_u32(v, 8 * n);
+#else
   // This needs to be wider than uint32 otherwise `mask << 32` will be
   // undefined.
   uint64 mask = 0xffffffff;
-  return ~(mask << (8 * n));
+  return v & ~(mask << (8 * n));
+#endif
 }
 
 // Helper class for decompression
@@ -831,7 +832,8 @@ class SnappyDecompressor {
           // Long literal.
           const size_t literal_length_length = literal_length - 60;
           literal_length =
-              (LittleEndian::Load32(ip) & WordMask(literal_length_length)) + 1;
+              ExtractLowBytes(LittleEndian::Load32(ip), literal_length_length) +
+              1;
           ip += literal_length_length;
         }
 
@@ -854,7 +856,8 @@ class SnappyDecompressor {
         MAYBE_REFILL();
       } else {
         const size_t entry = char_table[c];
-        const size_t trailer = LittleEndian::Load32(ip) & WordMask(entry >> 11);
+        const size_t trailer =
+            ExtractLowBytes(LittleEndian::Load32(ip), entry >> 11);
         const size_t length = entry & 0xff;
         ip += entry >> 11;