From b550fc0b090f63e95b567d34bdc3be258fbfe43a Mon Sep 17 00:00:00 2001 From: Ali Saidi Date: Tue, 14 Jun 2022 17:58:44 -0700 Subject: [PATCH] Modify the instructions emitted for PREFETCH on arm64 (#10117) Summary: __builtin_prefetch(..., 1) prefetches into the L2 cache on x86 while the same emits a pldl3keep instruction on arm64 which doesn't seem to be close enough. Testing on a Graviton3, and M1 system with memtablerep_bench fillrandom and skiplist throughput increased as follows adjusting the 1 to 2 or 3: ``` 1 -> 2 1 -> 3 ---------------------------- Graviton3 +10% +15% M1 +10% +10% ``` Given that prefetching into the L1 cache seems to help, I chose that conversion. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10117 Reviewed By: pdillinger Differential Revision: D37120475 fbshipit-source-id: db1ef43f941445019c68316500a2250acc643d5e --- port/port_posix.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/port/port_posix.h b/port/port_posix.h index b175c13ac8..01a1a28a05 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -202,7 +202,16 @@ extern void *cacheline_aligned_alloc(size_t size); extern void cacheline_aligned_free(void *memblock); +#if defined(__aarch64__) +// __builtin_prefetch(..., 1) turns into a prefetch into prfm pldl3keep. On +// arm64 we want this as close to the core as possible to turn it into a +// L1 prefetch unless locality == 0 in which case it will be turned into a +// non-temporal prefetch +#define PREFETCH(addr, rw, locality) \ + __builtin_prefetch(addr, rw, locality >= 1 ? 3 : locality) +#else #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) +#endif extern void Crash(const std::string& srcfile, int srcline);