From f52721b2b4e4949d26fa51572e3ad9000e1ed3be Mon Sep 17 00:00:00 2001 From: Jun He Date: Fri, 6 Aug 2021 14:46:53 +0800 Subject: [PATCH] decompression: optimize ExtractOffset for Arm Inspired by kExtractMasksCombined, this patch uses shifts to replace the table lookup. On Arm the codegen is 2 shift ops (lsl+lsr). Compared to the previous ldr, which has a 4-cycle latency, the lsl+lsr pair needs only 2 cycles. A slight (~0.3%) uplift is observed on N1, and ~3% on A72. Signed-off-by: Jun He Change-Id: I5b53632d22d9e5cf1a49d0c5cdd16265a15de23b --- snappy.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/snappy.cc b/snappy.cc index 3f446c6..4931104 100644 --- a/snappy.cc +++ b/snappy.cc @@ -1081,6 +1081,9 @@ inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) { reinterpret_cast(&kExtractMasksCombined) + 2 * tag_type, sizeof(result)); return val & result; +#elif defined(__aarch64__) + constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull; + return val & (uint32_t)((kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF); #else static constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0}; return val & kExtractMasks[tag_type];