mirror of https://github.com/facebook/rocksdb.git
More minor HCC refactoring + typed mmap (#11670)
Summary:
More code leading up to dynamic HCC.

* Small enhancements to cache_bench
* Extra assertion in Unref
* Improve a CAS loop in ChargeUsageMaybeEvictStrict
* Put load factor constants in appropriate class
* Move `standalone` field to HyperClockTable::HandleImpl because it can
  be encoded differently in the upcoming dynamic HCC.
* Add a typed version of MemMapping to simplify some future code.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11670

Test Plan: existing tests, unit test added for TypedMemMapping

Reviewed By: jowlyzhang

Differential Revision: D48056464

Pulled By: pdillinger

fbshipit-source-id: 186b7d3105c5d6d2eb6a592369bc10a97ee14a15
parent 4500a0d6ec
commit cdb11f5ce6

@@ -436,6 +436,10 @@ class CacheBench {
 
     printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());
 
+    size_t occ = cache_->GetOccupancyCount();
+    size_t slot = cache_->GetTableAddressCount();
+    printf("Final load factor: %g (%zu / %zu)\n", 1.0 * occ / slot, occ, slot);
+
     if (FLAGS_histograms) {
       printf("\nOperation latency (ns):\n");
       HistogramImpl combined;
@@ -676,6 +680,7 @@ class CacheBench {
 #endif
     printf("----------------------------\n");
     printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
+    printf("Cache impl name : %s\n", cache_->Name());
     printf("DMutex impl name : %s\n", DMutex::kName());
     printf("Number of threads : %u\n", FLAGS_threads);
     printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread);
@@ -79,8 +79,10 @@ inline void Unref(const ClockHandle& h, uint64_t count = 1) {
   // Pretend we never took the reference
   // WART: there's a tiny chance we release last ref to invisible
   // entry here. If that happens, we let eviction take care of it.
-  h.meta.fetch_sub(ClockHandle::kAcquireIncrement * count,
-                   std::memory_order_release);
+  uint64_t old_meta = h.meta.fetch_sub(ClockHandle::kAcquireIncrement * count,
+                                       std::memory_order_release);
+  assert(GetRefcount(old_meta) != 0);
+  (void)old_meta;
 }
 
 inline bool ClockUpdate(ClockHandle& h) {
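
The extra assertion above relies on std::atomic<>::fetch_sub returning the value
held before the subtraction, so the old reference count can be checked without a
separate load. A minimal standalone sketch of that behavior (illustrative code,
not part of this commit):

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  int main() {
    std::atomic<uint64_t> meta{3};  // pretend a refcount is packed in here
    // fetch_sub returns the pre-decrement value
    uint64_t old_meta = meta.fetch_sub(1, std::memory_order_release);
    assert(old_meta != 0);  // the reference being released must have existed
    (void)old_meta;         // keep release builds free of unused warnings
    return 0;
  }
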
@@ -406,14 +408,14 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict(
   // Grab any available capacity, and free up any more required.
   size_t old_usage = usage_.load(std::memory_order_relaxed);
   size_t new_usage;
-  if (LIKELY(old_usage != capacity)) {
-    do {
-      new_usage = std::min(capacity, old_usage + total_charge);
-    } while (!usage_.compare_exchange_weak(old_usage, new_usage,
-                                           std::memory_order_relaxed));
-  } else {
-    new_usage = old_usage;
-  }
+  do {
+    new_usage = std::min(capacity, old_usage + total_charge);
+    if (new_usage == old_usage) {
+      // No change needed
+      break;
+    }
+  } while (!usage_.compare_exchange_weak(old_usage, new_usage,
+                                         std::memory_order_relaxed));
   // How much do we need to evict then?
   size_t need_evict_charge = old_usage + total_charge - new_usage;
   size_t request_evict_charge = need_evict_charge;
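
The reworked loop above replaces the old LIKELY(old_usage != capacity) branch
with an early break inside the loop body: recompute the target from the most
recently observed value, skip the CAS entirely when nothing would change, and
otherwise retry until the exchange succeeds. A standalone sketch of the same
pattern (function and parameter names here are illustrative, not RocksDB API):

  #include <algorithm>
  #include <atomic>
  #include <cstddef>

  void AddUpToCapacity(std::atomic<size_t>& usage, size_t capacity,
                       size_t charge) {
    size_t old_usage = usage.load(std::memory_order_relaxed);
    size_t new_usage;
    do {
      new_usage = std::min(capacity, old_usage + charge);
      if (new_usage == old_usage) {
        break;  // already saturated at capacity; no store needed
      }
      // On failure, compare_exchange_weak refreshes old_usage and we recompute.
    } while (!usage.compare_exchange_weak(old_usage, new_usage,
                                          std::memory_order_relaxed));
  }
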
@@ -1418,7 +1420,7 @@ void AddShardEvaluation(const HyperClockCache::Shard& shard,
   // If filled to capacity, what would the occupancy ratio be?
   double ratio = occ_ratio / usage_ratio;
   // Given max load factor, what that load factor be?
-  double lf = ratio * kStrictLoadFactor;
+  double lf = ratio * HyperClockTable::kStrictLoadFactor;
   predicted_load_factors.push_back(lf);
 
   // Update min_recommendation also
@@ -1457,17 +1459,18 @@ void HyperClockCache::ReportProblems(
                       predicted_load_factors.end(), 0.0) /
       shard_count;
 
-  constexpr double kLowSpecLoadFactor = kLoadFactor / 2;
-  constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414;
-  if (average_load_factor > kLoadFactor) {
+  constexpr double kLowSpecLoadFactor = HyperClockTable::kLoadFactor / 2;
+  constexpr double kMidSpecLoadFactor = HyperClockTable::kLoadFactor / 1.414;
+  if (average_load_factor > HyperClockTable::kLoadFactor) {
    // Out of spec => Consider reporting load factor too high
    // Estimate effective overall capacity loss due to enforcing occupancy limit
    double lost_portion = 0.0;
    int over_count = 0;
    for (double lf : predicted_load_factors) {
-      if (lf > kStrictLoadFactor) {
+      if (lf > HyperClockTable::kStrictLoadFactor) {
        ++over_count;
-        lost_portion += (lf - kStrictLoadFactor) / lf / shard_count;
+        lost_portion +=
+            (lf - HyperClockTable::kStrictLoadFactor) / lf / shard_count;
      }
    }
    // >= 20% loss -> error
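
For a sense of scale in the lost_portion estimate above: if, hypothetically, one
shard's predicted load factor were 1.0 against the strict cap of 0.84, that
shard would be counted as losing (1.0 - 0.84) / 1.0 = 16% of its capacity, and
the division by shard_count averages that loss over all shards before the
">= 20% loss -> error" check noted in the trailing comment is applied.
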
@@ -282,29 +282,6 @@ class ClockCacheTest;
 
 // ----------------------------------------------------------------------- //
 
-// The load factor p is a real number in (0, 1) such that at all
-// times at most a fraction p of all slots, without counting tombstones,
-// are occupied by elements. This means that the probability that a random
-// probe hits an occupied slot is at most p, and thus at most 1/p probes
-// are required on average. For example, p = 70% implies that between 1 and 2
-// probes are needed on average (bear in mind that this reasoning doesn't
-// consider the effects of clustering over time, which should be negligible
-// with double hashing).
-// Because the size of the hash table is always rounded up to the next
-// power of 2, p is really an upper bound on the actual load factor---the
-// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
-// but bear in mind that slots only hold metadata, not actual values.
-// Since space cost is dominated by the values (the LSM blocks),
-// overprovisioning the table with metadata only increases the total cache space
-// usage by a tiny fraction.
-constexpr double kLoadFactor = 0.7;
-
-// The user can exceed kLoadFactor if the sizes of the inserted values don't
-// match estimated_value_size, or in some rare cases with
-// strict_capacity_limit == false. To avoid degenerate performance, we set a
-// strict upper bound on the load factor.
-constexpr double kStrictLoadFactor = 0.84;
-
 struct ClockHandleBasicData {
   Cache::ObjectPtr value = nullptr;
   const Cache::CacheItemHelper* helper = nullptr;
@@ -374,17 +351,6 @@ struct ClockHandle : public ClockHandleBasicData {
 
   // See above. Mutable for read reference counting.
   mutable std::atomic<uint64_t> meta{};
-
-  // Whether this is a "deteched" handle that is independently allocated
-  // with `new` (so must be deleted with `delete`).
-  // TODO: ideally this would be packed into some other data field, such
-  // as upper bits of total_charge, but that incurs a measurable performance
-  // regression.
-  bool standalone = false;
-
-  inline bool IsStandalone() const { return standalone; }
-
-  inline void SetStandalone() { standalone = true; }
 };  // struct ClockHandle
 
 class BaseClockTable {
@@ -476,6 +442,7 @@ class BaseClockTable {
   // Clock algorithm sweep pointer.
   std::atomic<uint64_t> clock_pointer_{};
 
+  // TODO: is this separation needed if we don't do background evictions?
   ALIGN_AS(CACHE_LINE_SIZE)
   // Number of elements in the table.
   std::atomic<size_t> occupancy_{};
@@ -508,6 +475,16 @@ class HyperClockTable : public BaseClockTable {
     // up in this slot or a higher one.
     std::atomic<uint32_t> displacements{};
 
+    // Whether this is a "deteched" handle that is independently allocated
+    // with `new` (so must be deleted with `delete`).
+    // TODO: ideally this would be packed into some other data field, such
+    // as upper bits of total_charge, but that incurs a measurable performance
+    // regression.
+    bool standalone = false;
+
+    inline bool IsStandalone() const { return standalone; }
+
+    inline void SetStandalone() { standalone = true; }
   };  // struct HandleImpl
 
   struct Opts {
@@ -561,6 +538,29 @@ class HyperClockTable : public BaseClockTable {
   void TEST_ReleaseN(HandleImpl* handle, size_t n);
 #endif
 
+  // The load factor p is a real number in (0, 1) such that at all
+  // times at most a fraction p of all slots, without counting tombstones,
+  // are occupied by elements. This means that the probability that a random
+  // probe hits an occupied slot is at most p, and thus at most 1/p probes
+  // are required on average. For example, p = 70% implies that between 1 and 2
+  // probes are needed on average (bear in mind that this reasoning doesn't
+  // consider the effects of clustering over time, which should be negligible
+  // with double hashing).
+  // Because the size of the hash table is always rounded up to the next
+  // power of 2, p is really an upper bound on the actual load factor---the
+  // actual load factor is anywhere between p/2 and p. This is a bit wasteful,
+  // but bear in mind that slots only hold metadata, not actual values.
+  // Since space cost is dominated by the values (the LSM blocks),
+  // overprovisioning the table with metadata only increases the total cache
+  // space usage by a tiny fraction.
+  static constexpr double kLoadFactor = 0.7;
+
+  // The user can exceed kLoadFactor if the sizes of the inserted values don't
+  // match estimated_value_size, or in some rare cases with
+  // strict_capacity_limit == false. To avoid degenerate performance, we set a
+  // strict upper bound on the load factor.
+  static constexpr double kStrictLoadFactor = 0.84;
+
  private:  // functions
   // Returns x mod 2^{length_bits_}.
   inline size_t ModTableSize(uint64_t x) {
@@ -915,8 +915,10 @@ TEST_F(ClockCacheTest, TableSizesTest) {
                          /*memory_allocator*/ nullptr, kDontChargeCacheMetadata)
             .MakeSharedCache();
     // Table sizes are currently only powers of two
-    EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor);
-    EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0);
+    EXPECT_GE(cache->GetTableAddressCount(),
+              est_count / HyperClockTable::kLoadFactor);
+    EXPECT_LE(cache->GetTableAddressCount(),
+              est_count / HyperClockTable::kLoadFactor * 2.0);
     EXPECT_EQ(cache->GetUsage(), 0);
 
     // kFullChargeMetaData
@@ -933,9 +935,9 @@ TEST_F(ClockCacheTest, TableSizesTest) {
       double est_count_after_meta =
           (capacity - cache->GetUsage()) * 1.0 / est_val_size;
       EXPECT_GE(cache->GetTableAddressCount(),
-                est_count_after_meta / kLoadFactor);
+                est_count_after_meta / HyperClockTable::kLoadFactor);
       EXPECT_LE(cache->GetTableAddressCount(),
-                est_count_after_meta / kLoadFactor * 2.0);
+                est_count_after_meta / HyperClockTable::kLoadFactor * 2.0);
     }
   }
 }
@@ -219,21 +219,28 @@ size_t PopMinorPageFaultCount() {
 
 TEST(MmapTest, AllocateLazyZeroed) {
   // Doesn't have to be page aligned
-  constexpr size_t len = 1234567;
-  MemMapping m = MemMapping::AllocateLazyZeroed(len);
-  auto arr = static_cast<char*>(m.Get());
-  // Should generally work
-  ASSERT_NE(arr, nullptr);
+  constexpr size_t len = 1234567;    // in bytes
+  constexpr size_t count = len / 8;  // in uint64_t objects
+  // Implicit conversion move
+  TypedMemMapping<uint64_t> pre_arr = MemMapping::AllocateLazyZeroed(len);
+  // Move from same type
+  TypedMemMapping<uint64_t> arr = std::move(pre_arr);
+
+  ASSERT_NE(arr.Get(), nullptr);
+  ASSERT_EQ(arr.Get(), &arr[0]);
+  ASSERT_EQ(arr.Get(), arr.MemMapping::Get());
+
+  ASSERT_EQ(arr.Length(), len);
+  ASSERT_EQ(arr.Count(), count);
 
   // Start counting page faults
   PopMinorPageFaultCount();
 
   // Access half of the allocation
   size_t i = 0;
-  for (; i < len / 2; ++i) {
+  for (; i < count / 2; ++i) {
     ASSERT_EQ(arr[i], 0);
-    arr[i] = static_cast<char>(i & 255);
+    arr[i] = i;
   }
 
   // Appropriate page faults (maybe more)
@@ -241,9 +248,9 @@ TEST(MmapTest, AllocateLazyZeroed) {
   ASSERT_GE(faults, len / 2 / port::kPageSize);
 
   // Access rest of the allocation
-  for (; i < len; ++i) {
+  for (; i < count; ++i) {
     ASSERT_EQ(arr[i], 0);
-    arr[i] = static_cast<char>(i & 255);
+    arr[i] = i;
   }
 
   // Appropriate page faults (maybe more)
@@ -251,8 +258,8 @@ TEST(MmapTest, AllocateLazyZeroed) {
   ASSERT_GE(faults, len / 2 / port::kPageSize);
 
   // Verify data
-  for (i = 0; i < len; ++i) {
-    ASSERT_EQ(arr[i], static_cast<char>(i & 255));
+  for (i = 0; i < count; ++i) {
+    ASSERT_EQ(arr[i], i);
   }
 }
 
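
The test relies on a PopMinorPageFaultCount() helper and port::kPageSize to
check that lazily zeroed pages only fault in when first touched. On POSIX
systems, that kind of counter can be read via getrusage(); a standalone sketch
of the idea (not the RocksDB helper itself):

  #include <sys/resource.h>

  #include <cstddef>

  size_t CurrentMinorPageFaults() {
    struct rusage usage;
    getrusage(RUSAGE_SELF, &usage);                 // self, not children
    return static_cast<size_t>(usage.ru_minflt);    // minor (soft) page faults
  }
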

port/mmap.h (20 changed lines)
@@ -14,6 +14,7 @@
 #endif  // OS_WIN
 
 #include <cstdint>
+#include <utility>
 
 #include "rocksdb/rocksdb_namespace.h"
 
@@ -67,4 +68,23 @@ class MemMapping {
   static MemMapping AllocateAnonymous(size_t length, bool huge);
 };
 
+// Simple MemMapping wrapper that presents the memory as an array of T.
+// For example,
+//   TypedMemMapping<uint64_t> arr = MemMapping::AllocateLazyZeroed(num_bytes);
+template <typename T>
+class TypedMemMapping : public MemMapping {
+ public:
+  /*implicit*/ TypedMemMapping(MemMapping&& v) noexcept
+      : MemMapping(std::move(v)) {}
+  TypedMemMapping& operator=(MemMapping&& v) noexcept {
+    MemMapping& base = *this;
+    base = std::move(v);
+  }
+
+  inline T* Get() const { return static_cast<T*>(MemMapping::Get()); }
+  inline size_t Count() const { return MemMapping::Length() / sizeof(T); }
+
+  inline T& operator[](size_t index) const { return Get()[index]; }
+};
+
 }  // namespace ROCKSDB_NAMESPACE
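
Taken together with the test changes above, typical usage of the new wrapper
looks like the following sketch (assumes the RocksDB headers shown in this
commit; the element type and byte count are illustrative):

  #include <cstddef>
  #include <cstdint>

  #include "port/mmap.h"

  using ROCKSDB_NAMESPACE::MemMapping;
  using ROCKSDB_NAMESPACE::TypedMemMapping;

  void Example() {
    constexpr size_t kBytes = 1 << 20;
    // The MemMapping result converts implicitly into the typed wrapper.
    TypedMemMapping<uint64_t> arr = MemMapping::AllocateLazyZeroed(kBytes);
    // Count() is the length in elements rather than bytes.
    for (size_t i = 0; i < arr.Count(); ++i) {
      arr[i] = i;  // operator[] indexes the mapping as a uint64_t array
    }
  }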