Reduce the LeftShiftOverflows() table size.

A previous CL introduced LeftShiftOverflows(), which takes a uint32
input. However, the value it operates on is guaranteed to fit in 8 bits
(the caller masks it with 0x7f). This CL takes advantage of this
restriction to shrink the static mask table used by LeftShiftOverflows()
from 32 uint32 entries to 32 uint8 entries.

The same methodology as the previous CL suggests a 0.6% improvement. The
improvement is likely bigger on mobile CPUs, which have much smaller
caches.

Benchmark results:

name                                          old time/op             new time/op             delta
BM_UFlat/0      [html             ]            42.5µs ± 1%             42.1µs ± 0%  -0.87%        (p=0.000 n=20+20)
BM_UFlat/1      [urls             ]             575µs ± 0%              574µs ± 0%  -0.16%        (p=0.000 n=20+19)
BM_UFlat/2      [jpg              ]            7.13µs ± 1%             7.20µs ± 5%    ~           (p=0.422 n=16+19)
BM_UFlat/3      [jpg_200          ]              129ns ± 0%              130ns ± 0%  +0.82%        (p=0.000 n=20+17)
BM_UFlat/4      [pdf              ]            8.22µs ± 1%             8.21µs ± 0%    ~           (p=0.586 n=17+17)
BM_UFlat/5      [html4            ]             222µs ± 0%              222µs ± 0%  -0.11%        (p=0.047 n=19+20)
BM_UFlat/6      [txt1             ]             192µs ± 0%              191µs ± 0%  -0.69%        (p=0.000 n=20+20)
BM_UFlat/7      [txt2             ]             169µs ± 0%              169µs ± 0%  -0.28%        (p=0.000 n=20+20)
BM_UFlat/8      [txt3             ]             510µs ± 0%              507µs ± 0%  -0.50%        (p=0.000 n=20+20)
BM_UFlat/9      [txt4             ]             707µs ± 0%              703µs ± 0%  -0.53%        (p=0.000 n=20+20)
BM_UFlat/10     [pb               ]            39.1µs ± 0%             38.5µs ± 0%  -1.56%        (p=0.000 n=20+20)
BM_UFlat/11     [gaviota          ]             189µs ± 0%              189µs ± 0%  -0.42%        (p=0.000 n=20+20)
BM_UFlat/12     [cp               ]            14.2µs ± 0%             14.2µs ± 1%  -0.30%        (p=0.001 n=18+19)
BM_UFlat/13     [c                ]            7.29µs ± 0%             7.34µs ± 1%  +0.59%        (p=0.000 n=19+20)
BM_UFlat/14     [lsp              ]            2.28µs ± 0%             2.29µs ± 1%  +0.39%        (p=0.000 n=19+18)
BM_UFlat/15     [xls              ]             905µs ± 0%              904µs ± 0%  -0.12%        (p=0.030 n=20+20)
BM_UFlat/16     [xls_200          ]              213ns ± 2%              215ns ± 4%  +0.92%        (p=0.011 n=20+20)
BM_UFlat/17     [bin              ]             274µs ± 0%              275µs ± 0%  +0.55%        (p=0.000 n=20+20)
BM_UFlat/18     [bin_200          ]              101ns ± 1%              101ns ± 1%    ~           (p=0.913 n=18+18)
BM_UFlat/19     [sum              ]            27.9µs ± 1%             27.5µs ± 1%  -1.38%        (p=0.000 n=20+20)
BM_UFlat/20     [man              ]            2.97µs ± 1%             2.97µs ± 1%    ~           (p=0.835 n=20+19)
BM_UValidate/0  [html             ]            33.5µs ± 0%             34.2µs ± 0%  +2.32%        (p=0.000 n=20+20)
BM_UValidate/1  [urls             ]             441µs ± 0%              442µs ± 0%  +0.15%        (p=0.010 n=20+20)
BM_UValidate/2  [jpg              ]              144ns ± 0%              146ns ± 0%  +1.32%        (p=0.000 n=20+20)
BM_UValidate/3  [jpg_200          ]             95.3ns ± 0%             96.0ns ± 0%  +0.68%        (p=0.000 n=20+20)
BM_UValidate/4  [pdf              ]            2.86µs ± 0%             2.88µs ± 1%  +0.67%        (p=0.000 n=19+19)
BM_UIOVec/0     [html             ]             122µs ± 0%              122µs ± 0%  -0.25%        (p=0.000 n=20+20)
BM_UIOVec/1     [urls             ]             1.08ms ± 0%             1.08ms ± 0%    ~           (p=0.068 n=20+20)
BM_UIOVec/2     [jpg              ]            7.63µs ± 7%             7.76µs ±11%    ~           (p=0.396 n=19+20)
BM_UIOVec/3     [jpg_200          ]              325ns ± 0%              326ns ± 0%  +0.27%        (p=0.000 n=20+18)
BM_UIOVec/4     [pdf              ]            12.1µs ± 2%             12.1µs ± 3%    ~           (p=0.967 n=19+20)
BM_UFlatSink/0  [html             ]            42.4µs ± 0%             42.1µs ± 0%  -0.89%        (p=0.000 n=20+20)
BM_UFlatSink/1  [urls             ]             575µs ± 0%              575µs ± 0%    ~           (p=0.883 n=20+20)
BM_UFlatSink/2  [jpg              ]            7.58µs ±16%             7.52µs ±15%    ~           (p=0.945 n=19+20)
BM_UFlatSink/3  [jpg_200          ]              133ns ± 4%              133ns ± 4%    ~           (p=0.627 n=19+20)
BM_UFlatSink/4  [pdf              ]            8.29µs ± 4%             8.39µs ± 4%  +1.14%        (p=0.013 n=19+18)
BM_UFlatSink/5  [html4            ]             223µs ± 0%              222µs ± 0%  -0.18%        (p=0.001 n=20+20)
BM_UFlatSink/6  [txt1             ]             192µs ± 0%              191µs ± 0%  -0.71%        (p=0.000 n=20+20)
BM_UFlatSink/7  [txt2             ]             169µs ± 0%              169µs ± 0%  -0.26%        (p=0.000 n=20+20)
BM_UFlatSink/8  [txt3             ]             510µs ± 0%              508µs ± 0%  -0.50%        (p=0.000 n=20+20)
BM_UFlatSink/9  [txt4             ]             707µs ± 0%              704µs ± 0%  -0.44%        (p=0.000 n=20+20)
BM_UFlatSink/10 [pb               ]            39.1µs ± 0%             38.5µs ± 1%  -1.62%        (p=0.000 n=19+20)
BM_UFlatSink/11 [gaviota          ]             189µs ± 0%              189µs ± 0%  -0.39%        (p=0.000 n=20+20)
BM_UFlatSink/12 [cp               ]            14.2µs ± 0%             14.2µs ± 1%    ~           (p=0.435 n=19+19)
BM_UFlatSink/13 [c                ]            7.29µs ± 0%             7.33µs ± 1%  +0.57%        (p=0.000 n=19+20)
BM_UFlatSink/14 [lsp              ]            2.29µs ± 0%             2.29µs ± 1%    ~           (p=0.791 n=18+18)
BM_UFlatSink/15 [xls              ]             903µs ± 0%              902µs ± 0%  -0.11%        (p=0.044 n=20+19)
BM_UFlatSink/16 [xls_200          ]              215ns ± 1%              215ns ± 1%    ~           (p=0.885 n=19+19)
BM_UFlatSink/17 [bin              ]             274µs ± 0%              275µs ± 0%  +0.51%        (p=0.000 n=20+20)
BM_UFlatSink/18 [bin_200          ]              103ns ± 2%              103ns ± 0%  -0.41%        (p=0.016 n=20+15)
BM_UFlatSink/19 [sum              ]            27.9µs ± 1%             27.5µs ± 1%  -1.34%        (p=0.000 n=20+19)
BM_UFlatSink/20 [man              ]            2.98µs ± 1%             2.97µs ± 1%    ~           (p=0.358 n=18+19)
BM_ZFlat/0      [html (22.31 %)   ]             126µs ± 0%              126µs ± 0%  +0.14%        (p=0.011 n=20+20)
BM_ZFlat/1      [urls (47.78 %)   ]             1.67ms ± 0%             1.67ms ± 0%  +0.11%        (p=0.043 n=20+20)
BM_ZFlat/2      [jpg (99.95 %)    ]            11.5µs ± 6%             11.7µs ± 7%    ~           (p=0.142 n=20+20)
BM_ZFlat/3      [jpg_200 (73.00 %)]              349ns ± 3%              351ns ± 3%    ~           (p=0.573 n=18+20)
BM_ZFlat/4      [pdf (83.30 %)    ]            14.6µs ± 2%             14.7µs ± 4%    ~           (p=0.879 n=19+20)
BM_ZFlat/5      [html4 (22.52 %)  ]             553µs ± 0%              552µs ± 0%  -0.23%        (p=0.000 n=20+20)
BM_ZFlat/6      [txt1 (57.88 %)   ]             540µs ± 0%              540µs ± 0%    ~           (p=0.221 n=20+20)
BM_ZFlat/7      [txt2 (61.91 %)   ]             479µs ± 0%              481µs ± 1%  +0.47%        (p=0.000 n=20+20)
BM_ZFlat/8      [txt3 (54.99 %)   ]             1.44ms ± 0%             1.44ms ± 0%  +0.13%        (p=0.040 n=20+20)
BM_ZFlat/9      [txt4 (66.26 %)   ]             1.97ms ± 0%             1.97ms ± 0%  +0.16%        (p=0.009 n=20+20)
BM_ZFlat/10     [pb (19.68 %)     ]             110µs ± 1%              109µs ± 1%  -0.79%        (p=0.000 n=20+20)
BM_ZFlat/11     [gaviota (37.72 %)]             410µs ± 0%              410µs ± 0%    ~           (p=0.149 n=20+19)
BM_ZFlat/12     [cp (48.12 %)     ]            45.4µs ± 1%             44.9µs ± 1%  -1.23%        (p=0.000 n=20+20)
BM_ZFlat/13     [c (42.47 %)      ]            17.5µs ± 0%             17.5µs ± 1%    ~           (p=0.883 n=20+20)
BM_ZFlat/14     [lsp (48.37 %)    ]            5.51µs ± 1%             5.46µs ± 1%  -0.95%        (p=0.000 n=20+18)
BM_ZFlat/15     [xls (41.23 %)    ]             1.61ms ± 0%             1.62ms ± 0%    ~           (p=0.183 n=20+20)
BM_ZFlat/16     [xls_200 (78.00 %)]              389ns ± 2%              391ns ± 3%    ~           (p=0.740 n=18+20)
BM_ZFlat/17     [bin (18.11 %)    ]             508µs ± 0%              508µs ± 0%    ~           (p=0.779 n=20+20)
BM_ZFlat/18     [bin_200 (7.50 %) ]             87.4ns ± 5%             88.1ns ± 8%    ~           (p=0.367 n=16+19)
BM_ZFlat/19     [sum (48.96 %)    ]            79.1µs ± 0%             80.2µs ± 0%  +1.39%        (p=0.000 n=20+20)
BM_ZFlat/20     [man (59.21 %)    ]            7.55µs ± 1%             7.57µs ± 1%  +0.31%        (p=0.025 n=19+19)

name                                          old speed               new speed               delta
BM_UFlat/0      [html             ]           2.42GB/s ± 0%           2.44GB/s ± 0%  +0.77%        (p=0.000 n=19+19)
BM_UFlat/1      [urls             ]           1.22GB/s ± 0%           1.23GB/s ± 0%  +0.06%        (p=0.000 n=20+19)
BM_UFlat/2      [jpg              ]           17.3GB/s ± 2%           17.2GB/s ± 4%    ~           (p=0.433 n=17+19)
BM_UFlat/3      [jpg_200          ]           1.56GB/s ± 0%           1.54GB/s ± 0%  -0.82%        (p=0.000 n=20+20)
BM_UFlat/4      [pdf              ]           12.5GB/s ± 1%           12.5GB/s ± 1%    ~           (p=0.322 n=17+17)
BM_UFlat/5      [html4            ]           1.85GB/s ± 0%           1.85GB/s ± 0%  +0.16%        (p=0.000 n=20+20)
BM_UFlat/6      [txt1             ]            794MB/s ± 0%            800MB/s ± 0%  +0.68%        (p=0.000 n=18+20)
BM_UFlat/7      [txt2             ]            741MB/s ± 0%            743MB/s ± 0%  +0.30%        (p=0.000 n=19+19)
BM_UFlat/8      [txt3             ]            840MB/s ± 0%            844MB/s ± 0%  +0.53%        (p=0.000 n=18+20)
BM_UFlat/9      [txt4             ]            684MB/s ± 0%            688MB/s ± 0%  +0.57%        (p=0.000 n=20+17)
BM_UFlat/10     [pb               ]           3.04GB/s ± 0%           3.09GB/s ± 0%  +1.60%        (p=0.000 n=19+20)
BM_UFlat/11     [gaviota          ]            977MB/s ± 0%            981MB/s ± 0%  +0.45%        (p=0.000 n=19+19)
BM_UFlat/12     [cp               ]           1.74GB/s ± 0%           1.74GB/s ± 0%  +0.29%        (p=0.000 n=20+19)
BM_UFlat/13     [c                ]           1.53GB/s ± 0%           1.52GB/s ± 1%  -0.56%        (p=0.000 n=19+20)
BM_UFlat/14     [lsp              ]           1.64GB/s ± 0%           1.63GB/s ± 1%  -0.38%        (p=0.000 n=19+20)
BM_UFlat/15     [xls              ]           1.14GB/s ± 0%           1.14GB/s ± 0%  +0.11%        (p=0.000 n=19+20)
BM_UFlat/16     [xls_200          ]            941MB/s ± 1%            931MB/s ± 4%  -1.02%        (p=0.001 n=19+20)
BM_UFlat/17     [bin              ]           1.88GB/s ± 0%           1.87GB/s ± 0%  -0.51%        (p=0.000 n=20+20)
BM_UFlat/18     [bin_200          ]           1.98GB/s ± 0%           1.98GB/s ± 1%    ~           (p=0.767 n=18+18)
BM_UFlat/19     [sum              ]           1.37GB/s ± 0%           1.39GB/s ± 0%  +1.46%        (p=0.000 n=20+20)
BM_UFlat/20     [man              ]           1.43GB/s ± 0%           1.43GB/s ± 0%    ~           (p=0.501 n=18+18)
BM_UValidate/0  [html             ]           3.07GB/s ± 0%           3.00GB/s ± 0%  -2.25%        (p=0.000 n=20+20)
BM_UValidate/1  [urls             ]           1.60GB/s ± 0%           1.59GB/s ± 0%  -0.11%        (p=0.000 n=18+19)
BM_UValidate/2  [jpg              ]            859GB/s ± 0%            848GB/s ± 0%  -1.29%        (p=0.000 n=20+19)
BM_UValidate/3  [jpg_200          ]           2.10GB/s ± 0%           2.09GB/s ± 0%  -0.68%        (p=0.000 n=19+20)
BM_UValidate/4  [pdf              ]           35.9GB/s ± 0%           35.6GB/s ± 1%  -0.71%        (p=0.000 n=20+20)
BM_UIOVec/0     [html             ]            843MB/s ± 0%            844MB/s ± 0%  +0.21%        (p=0.000 n=20+20)
BM_UIOVec/1     [urls             ]            651MB/s ± 0%            650MB/s ± 0%  -0.10%        (p=0.000 n=20+20)
BM_UIOVec/2     [jpg              ]           16.2GB/s ± 6%           16.0GB/s ±10%    ~           (p=0.380 n=19+20)
BM_UIOVec/3     [jpg_200          ]            617MB/s ± 0%            615MB/s ± 0%  -0.24%        (p=0.000 n=20+17)
BM_UIOVec/4     [pdf              ]           8.52GB/s ± 3%           8.50GB/s ± 3%    ~           (p=0.771 n=19+20)
BM_UFlatSink/0  [html             ]           2.42GB/s ± 0%           2.44GB/s ± 0%  +0.93%        (p=0.000 n=20+20)
BM_UFlatSink/1  [urls             ]           1.23GB/s ± 0%           1.23GB/s ± 0%  +0.04%        (p=0.006 n=20+20)
BM_UFlatSink/2  [jpg              ]           16.4GB/s ±14%           16.5GB/s ±13%    ~           (p=0.879 n=19+20)
BM_UFlatSink/3  [jpg_200          ]           1.51GB/s ± 4%           1.51GB/s ± 4%    ~           (p=0.874 n=18+20)
BM_UFlatSink/4  [pdf              ]           12.4GB/s ± 4%           12.3GB/s ± 4%  -1.11%        (p=0.016 n=19+18)
BM_UFlatSink/5  [html4            ]           1.85GB/s ± 0%           1.85GB/s ± 0%  +0.20%        (p=0.000 n=20+20)
BM_UFlatSink/6  [txt1             ]            794MB/s ± 0%            799MB/s ± 0%  +0.72%        (p=0.000 n=19+20)
BM_UFlatSink/7  [txt2             ]            741MB/s ± 0%            743MB/s ± 0%  +0.30%        (p=0.000 n=18+20)
BM_UFlatSink/8  [txt3             ]            839MB/s ± 0%            843MB/s ± 0%  +0.52%        (p=0.000 n=20+18)
BM_UFlatSink/9  [txt4             ]            684MB/s ± 0%            687MB/s ± 0%  +0.46%        (p=0.000 n=20+20)
BM_UFlatSink/10 [pb               ]           3.04GB/s ± 0%           3.09GB/s ± 0%  +1.71%        (p=0.000 n=20+19)
BM_UFlatSink/11 [gaviota          ]            976MB/s ± 0%            980MB/s ± 0%  +0.45%        (p=0.000 n=20+20)
BM_UFlatSink/12 [cp               ]           1.74GB/s ± 1%           1.74GB/s ± 1%    ~           (p=0.904 n=20+20)
BM_UFlatSink/13 [c                ]           1.53GB/s ± 0%           1.53GB/s ± 1%  -0.50%        (p=0.000 n=19+20)
BM_UFlatSink/14 [lsp              ]           1.63GB/s ± 1%           1.63GB/s ± 1%    ~           (p=0.358 n=19+18)
BM_UFlatSink/15 [xls              ]           1.14GB/s ± 0%           1.15GB/s ± 0%  +0.12%        (p=0.000 n=20+20)
BM_UFlatSink/16 [xls_200          ]            931MB/s ± 1%            931MB/s ± 1%    ~           (p=0.686 n=19+19)
BM_UFlatSink/17 [bin              ]           1.88GB/s ± 0%           1.87GB/s ± 0%  -0.53%        (p=0.000 n=20+20)
BM_UFlatSink/18 [bin_200          ]           1.94GB/s ± 2%           1.95GB/s ± 1%  +0.42%        (p=0.014 n=20+15)
BM_UFlatSink/19 [sum              ]           1.37GB/s ± 0%           1.39GB/s ± 0%  +1.38%        (p=0.000 n=19+18)
BM_UFlatSink/20 [man              ]           1.42GB/s ± 1%           1.43GB/s ± 0%    ~           (p=0.284 n=18+19)
BM_ZFlat/0      [html (22.31 %)   ]            815MB/s ± 0%            814MB/s ± 0%  -0.15%        (p=0.000 n=20+20)
BM_ZFlat/1      [urls (47.78 %)   ]            423MB/s ± 0%            422MB/s ± 0%  -0.14%        (p=0.000 n=20+20)
BM_ZFlat/2      [jpg (99.95 %)    ]           10.8GB/s ± 5%           10.6GB/s ± 7%    ~           (p=0.142 n=20+20)
BM_ZFlat/3      [jpg_200 (73.00 %)]            574MB/s ± 2%            572MB/s ± 2%    ~           (p=0.613 n=18+20)
BM_ZFlat/4      [pdf (83.30 %)    ]           7.01GB/s ± 2%           7.01GB/s ± 4%    ~           (p=0.593 n=18+20)
BM_ZFlat/5      [html4 (22.52 %)  ]            743MB/s ± 0%            745MB/s ± 0%  +0.25%        (p=0.000 n=20+19)
BM_ZFlat/6      [txt1 (57.88 %)   ]            283MB/s ± 0%            282MB/s ± 0%    ~           (p=0.261 n=18+19)
BM_ZFlat/7      [txt2 (61.91 %)   ]            262MB/s ± 0%            261MB/s ± 0%  -0.35%        (p=0.000 n=20+19)
BM_ZFlat/8      [txt3 (54.99 %)   ]            298MB/s ± 0%            297MB/s ± 0%  -0.11%        (p=0.000 n=20+19)
BM_ZFlat/9      [txt4 (66.26 %)   ]            245MB/s ± 0%            245MB/s ± 0%  -0.13%        (p=0.000 n=19+20)
BM_ZFlat/10     [pb (19.68 %)     ]           1.08GB/s ± 0%           1.09GB/s ± 0%  +0.82%        (p=0.000 n=18+19)
BM_ZFlat/11     [gaviota (37.72 %)]            451MB/s ± 0%            451MB/s ± 0%  -0.05%        (p=0.004 n=19+20)
BM_ZFlat/12     [cp (48.12 %)     ]            543MB/s ± 1%            550MB/s ± 1%  +1.24%        (p=0.000 n=20+20)
BM_ZFlat/13     [c (42.47 %)      ]            638MB/s ± 0%            637MB/s ± 0%    ~           (p=0.708 n=19+19)
BM_ZFlat/14     [lsp (48.37 %)    ]            678MB/s ± 2%            684MB/s ± 1%  +0.89%        (p=0.000 n=20+19)
BM_ZFlat/15     [xls (41.23 %)    ]            640MB/s ± 0%            640MB/s ± 0%  -0.10%        (p=0.000 n=19+19)
BM_ZFlat/16     [xls_200 (78.00 %)]            515MB/s ± 2%            514MB/s ± 3%    ~           (p=0.916 n=18+19)
BM_ZFlat/17     [bin (18.11 %)    ]           1.01GB/s ± 0%           1.01GB/s ± 0%  +0.03%        (p=0.033 n=20+20)
BM_ZFlat/18     [bin_200 (7.50 %) ]           2.30GB/s ± 6%           2.28GB/s ± 9%    ~           (p=0.502 n=16+19)
BM_ZFlat/19     [sum (48.96 %)    ]            485MB/s ± 0%            478MB/s ± 0%  -1.39%        (p=0.000 n=19+20)
BM_ZFlat/20     [man (59.21 %)    ]            562MB/s ± 1%            560MB/s ± 1%  -0.37%        (p=0.016 n=18+19)
This commit is contained in:
costan 2019-01-08 11:31:10 -08:00 committed by Victor Costan
parent 4f0adca400
commit 97a20b480f

View file

@@ -732,17 +732,13 @@ static inline uint32 ExtractLowBytes(uint32 v, int n) {
#endif #endif
} }
static inline bool LeftShiftOverflows(uint32 value, uint32 shift) { static inline bool LeftShiftOverflows(uint8 value, uint32 shift) {
DCHECK_LT(shift, 32); DCHECK_LT(shift, 32);
static const uint32 masks[] = { static const uint8 masks[] = {
0x00000000, 0x80000000, 0xc0000000, 0xe0000000, // 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
0xf0000000, 0xf8000000, 0xfc000000, 0xfe000000, // 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
0xff000000, 0xff800000, 0xffc00000, 0xffe00000, // 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, //
0xfff00000, 0xfff80000, 0xfffc0000, 0xfffe0000, // 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
0xffff0000, 0xffff8000, 0xffffc000, 0xffffe000, //
0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, //
0xffffff00, 0xffffff80, 0xffffffc0, 0xffffffe0, //
0xfffffff0, 0xfffffff8, 0xfffffffc, 0xfffffffe};
return (value & masks[shift]) != 0; return (value & masks[shift]) != 0;
} }
@@ -798,7 +794,7 @@ class SnappyDecompressor {
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip)); const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
reader_->Skip(1); reader_->Skip(1);
uint32 val = c & 0x7f; uint32 val = c & 0x7f;
if (LeftShiftOverflows(val, shift)) return false; if (LeftShiftOverflows(static_cast<uint8>(val), shift)) return false;
*result |= val << shift; *result |= val << shift;
if (c < 128) { if (c < 128) {
break; break;