mirror of https://github.com/google/snappy.git
decompression: optimize ExtractOffset for Arm
Inspired by kExtractMasksCombined, this patch replaces the table lookup with shifts. On Arm, the generated code is two shift instructions (lsl + lsr). Compared to the previous ldr, which has a latency of 4 cycles, the lsl + lsr pair needs only 2 cycles. A slight (~0.3%) uplift was observed on N1, and ~3% on A72. Signed-off-by: Jun He <jun.he@arm.com> Change-Id: I5b53632d22d9e5cf1a49d0c5cdd16265a15de23b
This commit is contained in:
parent
f2db8f77ce
commit
f52721b2b4
|
@ -1081,6 +1081,9 @@ inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
|
|||
reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,
|
||||
sizeof(result));
|
||||
return val & result;
|
||||
#elif defined(__aarch64__)
|
||||
constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
|
||||
return val & (uint32_t)((kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
|
||||
#else
|
||||
static constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0};
|
||||
return val & kExtractMasks[tag_type];
|
||||
|
|
Loading…
Reference in New Issue