rocksdb/memtable/skiplistrep.cc

408 lines
14 KiB
C++
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
Memtable sampling for mempurge heuristic. (#8628) Summary: Changes the API of the MemPurge process: the `bool experimental_allow_mempurge` and `experimental_mempurge_policy` flags have been replaced by a `double experimental_mempurge_threshold` option. This change of API reflects another major change introduced in this PR: the MemPurgeDecider() function now works by sampling the memtables being flushed to estimate the overall amount of useful payload (payload minus the garbage), and then compare this useful payload estimate with the `double experimental_mempurge_threshold` value. Therefore, when the value of this flag is `0.0` (default value), mempurge is simply deactivated. On the other hand, a value of `DBL_MAX` would be equivalent to always going through a mempurge regardless of the garbage ratio estimate. At the moment, a `double experimental_mempurge_threshold` value else than 0.0 or `DBL_MAX` is opnly supported`with the `SkipList` memtable representation. Regarding the sampling, this PR includes the introduction of a `MemTable::UniqueRandomSample` function that collects (approximately) random entries from the memtable by using the new `SkipList::Iterator::RandomSeek()` under the hood, or by iterating through each memtable entry, depending on the target sample size and the total number of entries. The unit tests have been readapted to support this new API. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8628 Reviewed By: pdillinger Differential Revision: D30149315 Pulled By: bjlemaire fbshipit-source-id: 1feef5390c95db6f4480ab4434716533d3947f27
2021-08-11 01:07:48 +00:00
#include <random>
#include "db/memtable.h"
#include "memory/arena.h"
#include "memtable/inlineskiplist.h"
2015-11-19 22:24:29 +00:00
#include "rocksdb/memtablerep.h"
#include "rocksdb/utilities/options_type.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
namespace {
class SkipListRep : public MemTableRep {
2015-11-19 22:24:29 +00:00
InlineSkipList<const MemTableRep::KeyComparator&> skip_list_;
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
const MemTableRep::KeyComparator& cmp_;
const SliceTransform* transform_;
const size_t lookahead_;
friend class LookaheadIterator;
public:
explicit SkipListRep(const MemTableRep::KeyComparator& compare,
Allocator* allocator, const SliceTransform* transform,
const size_t lookahead)
: MemTableRep(allocator),
skip_list_(compare, allocator),
cmp_(compare),
transform_(transform),
lookahead_(lookahead) {}
KeyHandle Allocate(const size_t len, char** buf) override {
*buf = skip_list_.AllocateKey(len);
return static_cast<KeyHandle>(*buf);
}
2015-11-19 22:24:29 +00:00
// Insert key into the list.
// REQUIRES: nothing that compares equal to key is currently in the list.
void Insert(KeyHandle handle) override {
skip_list_.Insert(static_cast<char*>(handle));
}
bool InsertKey(KeyHandle handle) override {
return skip_list_.Insert(static_cast<char*>(handle));
}
void InsertWithHint(KeyHandle handle, void** hint) override {
skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
}
bool InsertKeyWithHint(KeyHandle handle, void** hint) override {
return skip_list_.InsertWithHint(static_cast<char*>(handle), hint);
}
void InsertWithHintConcurrently(KeyHandle handle, void** hint) override {
skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle), hint);
}
bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override {
return skip_list_.InsertWithHintConcurrently(static_cast<char*>(handle),
hint);
}
void InsertConcurrently(KeyHandle handle) override {
skip_list_.InsertConcurrently(static_cast<char*>(handle));
}
bool InsertKeyConcurrently(KeyHandle handle) override {
return skip_list_.InsertConcurrently(static_cast<char*>(handle));
}
support for concurrent adds to memtable Summary: This diff adds support for concurrent adds to the skiplist memtable implementations. Memory allocation is made thread-safe by the addition of a spinlock, with small per-core buffers to avoid contention. Concurrent memtable writes are made via an additional method and don't impose a performance overhead on the non-concurrent case, so parallelism can be selected on a per-batch basis. Write thread synchronization is an increasing bottleneck for higher levels of concurrency, so this diff adds --enable_write_thread_adaptive_yield (default off). This feature causes threads joining a write batch group to spin for a short time (default 100 usec) using sched_yield, rather than going to sleep on a mutex. If the timing of the yield calls indicates that another thread has actually run during the yield then spinning is avoided. This option improves performance for concurrent situations even without parallel adds, although it has the potential to increase CPU usage (and the heuristic adaptation is not yet mature). Parallel writes are not currently compatible with inplace updates, update callbacks, or delete filtering. Enable it with --allow_concurrent_memtable_write (and --enable_write_thread_adaptive_yield). Parallel memtable writes are performance neutral when there is no actual parallelism, and in my experiments (SSD server-class Linux and varying contention and key sizes for fillrandom) they are always a performance win when there is more than one thread. Statistics are updated earlier in the write path, dropping the number of DB mutex acquisitions from 2 to 1 for almost all cases. This diff was motivated and inspired by Yahoo's cLSM work. It is more conservative than cLSM: RocksDB's write batch group leader role is preserved (along with all of the existing flush and write throttling logic) and concurrent writers are blocked until all memtable insertions have completed and the sequence number has been advanced, to preserve linearizability. My test config is "db_bench -benchmarks=fillrandom -threads=$T -batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T -level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999 -disable_auto_compactions --max_write_buffer_number=8 -max_background_flushes=8 --disable_wal --write_buffer_size=160000000 --block_size=16384 --allow_concurrent_memtable_write" on a two-socket Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1 thread I get ~440Kops/sec. Peak performance for 1 socket (numactl -N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance across both sockets happens at 30 threads, and is ~900Kops/sec, although with fewer threads there is less performance loss when the system has background work. Test Plan: 1. concurrent stress tests for InlineSkipList and DynamicBloom 2. make clean; make check 3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench 4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench 5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench 6. make clean; OPT=-DROCKSDB_LITE make check 7. verify no perf regressions when disabled Reviewers: igor, sdong Reviewed By: sdong Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
// Returns true iff an entry that compares equal to key is in the list.
bool Contains(const char* key) const override {
return skip_list_.Contains(key);
}
size_t ApproximateMemoryUsage() override {
// All memory is allocated through allocator; nothing to report here
return 0;
}
void Get(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry)) override {
SkipListRep::Iterator iter(&skip_list_);
Slice dummy_slice;
for (iter.Seek(dummy_slice, k.memtable_key().data());
iter.Valid() && callback_func(callback_args, iter.key());
iter.Next()) {
}
}
Add an option to verify memtable key order during reads (#12889) Summary: add a new CF option `paranoid_memory_checks` that allows additional data integrity validations during read/scan. Currently, skiplist-based memtable will validate the order of keys visited. Further data validation can be added in different layers. The option will be opt-in due to performance overhead. The motivation for this feature is for services where data correctness is critical and want to detect in-memory corruption earlier. For a corrupted memtable key, this feature can help to detect it during during reads instead of during flush with existing protections (OutputValidator that verifies key order or per kv checksum). See internally linked task for more context. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12889 Test Plan: * new unit test added for paranoid_memory_checks=true. * existing unit test for paranoid_memory_checks=false. * enable in stress test. Performance Benchmark: we check for performance regression in read path where data is in memtable only. For each benchmark, the script was run at the same time for main and this PR: * Memtable-only randomread ops/sec: ``` (for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000 --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; Main: 608146 PR with paranoid_memory_checks=false: 607727 (- %0.07) PR with paranoid_memory_checks=true: 521889 (-%14.2) ``` * Memtable-only sequential scan ops/sec: ``` (for I in $(seq 1 50); do ./db_bench--benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 9180077 PR with paranoid_memory_checks=false: 9536241 (+%3.8) PR with paranoid_memory_checks=true: 7653934 (-%16.6) ``` * Memtable-only reverse scan ops/sec: ``` (for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 1285719 PR with integrity_checks=false: 1431626 (+%11.3) PR with integrity_checks=true: 811031 (-%36.9) ``` The `readrandom` benchmark shows no regression. The scanning benchmarks show improvement that I can't explain. Reviewed By: pdillinger Differential Revision: D60414267 Pulled By: cbi42 fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91
2024-08-19 20:53:25 +00:00
Status GetAndValidate(const LookupKey& k, void* callback_args,
bool (*callback_func)(void* arg, const char* entry),
bool allow_data_in_errors) override {
SkipListRep::Iterator iter(&skip_list_);
Slice dummy_slice;
Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(),
allow_data_in_errors);
for (; iter.Valid() && status.ok() &&
callback_func(callback_args, iter.key());
status = iter.NextAndValidate(allow_data_in_errors)) {
}
return status;
}
uint64_t ApproximateNumEntries(const Slice& start_ikey,
const Slice& end_ikey) override {
Re-implement GetApproximateMemTableStats for skip lists (#13047) Summary: GetApproximateMemTableStats() could return some bad results with the standard skip list memtable. See this new db_bench test showing the dismal distribution of results when the actual number of entries in range is 1000: ``` $ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=1000 ... filluniquerandom : 1.391 micros/op 718915 ops/sec 1.391 seconds 1000000 operations; 11.7 MB/s approximatememtablestats : 3.711 micros/op 269492 ops/sec 3.711 seconds 1000000 operations; Reported entry count stats (expected 1000): Count: 1000000 Average: 2344.1611 StdDev: 26587.27 Min: 0 Median: 965.8555 Max: 835273 Percentiles: P50: 965.86 P75: 1610.77 P99: 12618.01 P99.9: 74991.58 P99.99: 830970.97 ------------------------------------------------------ [ 0, 1 ] 131344 13.134% 13.134% ### ( 1, 2 ] 115 0.011% 13.146% ( 2, 3 ] 106 0.011% 13.157% ( 3, 4 ] 190 0.019% 13.176% ( 4, 6 ] 214 0.021% 13.197% ( 6, 10 ] 522 0.052% 13.249% ( 10, 15 ] 748 0.075% 13.324% ( 15, 22 ] 1002 0.100% 13.424% ( 22, 34 ] 1948 0.195% 13.619% ( 34, 51 ] 3067 0.307% 13.926% ( 51, 76 ] 4213 0.421% 14.347% ( 76, 110 ] 5721 0.572% 14.919% ( 110, 170 ] 11375 1.137% 16.056% ( 170, 250 ] 17928 1.793% 17.849% ( 250, 380 ] 36597 3.660% 21.509% # ( 380, 580 ] 77882 7.788% 29.297% ## ( 580, 870 ] 160193 16.019% 45.317% ### ( 870, 1300 ] 210098 21.010% 66.326% #### ( 1300, 1900 ] 167461 16.746% 83.072% ### ( 1900, 2900 ] 78678 7.868% 90.940% ## ( 2900, 4400 ] 47743 4.774% 95.715% # ( 4400, 6600 ] 17650 1.765% 97.480% ( 6600, 9900 ] 11895 1.190% 98.669% ( 9900, 14000 ] 4993 0.499% 99.168% ( 14000, 22000 ] 2384 0.238% 99.407% ( 22000, 33000 ] 1966 0.197% 99.603% ( 50000, 75000 ] 2968 0.297% 99.900% ( 570000, 860000 ] 999 0.100% 100.000% readrandom : 1.967 micros/op 508487 ops/sec 1.967 seconds 1000000 operations; 8.2 MB/s (1000000 of 1000000 found) ``` Perhaps the only good thing to say about the old implementation was that it was fast, though apparently not that fast. I've implemented a much more robust and reasonably fast new version of the function. It's still logarithmic but with some larger constant factors. The standard deviation from true count is around 20% or less, and roughly the CPU cost of two memtable point look-ups. See code comments for detail. ``` $ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=1000 ... filluniquerandom : 1.478 micros/op 676434 ops/sec 1.478 seconds 1000000 operations; 11.0 MB/s approximatememtablestats : 2.694 micros/op 371157 ops/sec 2.694 seconds 1000000 operations; Reported entry count stats (expected 1000): Count: 1000000 Average: 1073.5158 StdDev: 197.80 Min: 608 Median: 1079.9506 Max: 2176 Percentiles: P50: 1079.95 P75: 1223.69 P99: 1852.36 P99.9: 1898.70 P99.99: 2176.00 ------------------------------------------------------ ( 580, 870 ] 134848 13.485% 13.485% ### ( 870, 1300 ] 747868 74.787% 88.272% ############### ( 1300, 1900 ] 116536 11.654% 99.925% ## ( 1900, 2900 ] 748 0.075% 100.000% readrandom : 1.997 micros/op 500654 ops/sec 1.997 seconds 1000000 operations; 8.1 MB/s (1000000 of 1000000 found) ``` We can already see that the distribution of results is dramatically better and wonderfully normal-looking, with relative standard deviation around 20%. The function is also FASTER, at least with these parameters. Let's look how this behavior generalizes, first *much* larger range: ``` $ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=30000 filluniquerandom : 1.390 micros/op 719654 ops/sec 1.376 seconds 990000 operations; 11.7 MB/s approximatememtablestats : 1.129 micros/op 885649 ops/sec 1.129 seconds 1000000 operations; Reported entry count stats (expected 30000): Count: 1000000 Average: 31098.8795 StdDev: 3601.47 Min: 21504 Median: 29333.9303 Max: 43008 Percentiles: P50: 29333.93 P75: 33018.00 P99: 43008.00 P99.9: 43008.00 P99.99: 43008.00 ------------------------------------------------------ ( 14000, 22000 ] 408 0.041% 0.041% ( 22000, 33000 ] 749327 74.933% 74.974% ############### ( 33000, 50000 ] 250265 25.027% 100.000% ##### readrandom : 1.894 micros/op 528083 ops/sec 1.894 seconds 1000000 operations; 8.5 MB/s (989989 of 1000000 found) ``` This is *even faster* and relatively *more accurate*, with relative standard deviation closer to 10%. Code comments explain why. Now let's look at smaller ranges. Implementation quirks or conveniences: * When actual number in range is >= 40, the minimum return value is 40. * When the actual is <= 10, it is guaranteed to return that actual number. ``` $ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=75 ... filluniquerandom : 1.417 micros/op 705668 ops/sec 1.417 seconds 999975 operations; 11.4 MB/s approximatememtablestats : 3.342 micros/op 299197 ops/sec 3.342 seconds 1000000 operations; Reported entry count stats (expected 75): Count: 1000000 Average: 75.1210 StdDev: 15.02 Min: 40 Median: 71.9395 Max: 256 Percentiles: P50: 71.94 P75: 89.69 P99: 119.12 P99.9: 166.68 P99.99: 229.78 ------------------------------------------------------ ( 34, 51 ] 38867 3.887% 3.887% # ( 51, 76 ] 550554 55.055% 58.942% ########### ( 76, 110 ] 398854 39.885% 98.828% ######## ( 110, 170 ] 11353 1.135% 99.963% ( 170, 250 ] 364 0.036% 99.999% ( 250, 380 ] 8 0.001% 100.000% readrandom : 1.861 micros/op 537224 ops/sec 1.861 seconds 1000000 operations; 8.7 MB/s (999974 of 1000000 found) $ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=25 ... filluniquerandom : 1.501 micros/op 666283 ops/sec 1.501 seconds 1000000 operations; 10.8 MB/s approximatememtablestats : 5.118 micros/op 195401 ops/sec 5.118 seconds 1000000 operations; Reported entry count stats (expected 25): Count: 1000000 Average: 26.2392 StdDev: 4.58 Min: 25 Median: 28.4590 Max: 72 Percentiles: P50: 28.46 P75: 31.69 P99: 49.27 P99.9: 67.95 P99.99: 72.00 ------------------------------------------------------ ( 22, 34 ] 928936 92.894% 92.894% ################### ( 34, 51 ] 67960 6.796% 99.690% # ( 51, 76 ] 3104 0.310% 100.000% readrandom : 1.892 micros/op 528595 ops/sec 1.892 seconds 1000000 operations; 8.6 MB/s (1000000 of 1000000 found) $ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=10 ... filluniquerandom : 1.642 micros/op 608916 ops/sec 1.642 seconds 1000000 operations; 9.9 MB/s approximatememtablestats : 3.042 micros/op 328721 ops/sec 3.042 seconds 1000000 operations; Reported entry count stats (expected 10): Count: 1000000 Average: 10.0000 StdDev: 0.00 Min: 10 Median: 10.0000 Max: 10 Percentiles: P50: 10.00 P75: 10.00 P99: 10.00 P99.9: 10.00 P99.99: 10.00 ------------------------------------------------------ ( 6, 10 ] 1000000 100.000% 100.000% #################### readrandom : 1.805 micros/op 554126 ops/sec 1.805 seconds 1000000 operations; 9.0 MB/s (1000000 of 1000000 found) ``` Remarkably consistent. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13047 Test Plan: new db_bench test for both performance and accuracy (see above); added to crash test; unit test updated. Reviewed By: cbi42 Differential Revision: D63722003 Pulled By: pdillinger fbshipit-source-id: cfc8613c085e87c17ecec22d82601aac2a5a1b26
2024-10-02 21:25:50 +00:00
return skip_list_.ApproximateNumEntries(start_ikey, end_ikey);
}
void UniqueRandomSample(const uint64_t num_entries,
const uint64_t target_sample_size,
Memtable sampling for mempurge heuristic. (#8628) Summary: Changes the API of the MemPurge process: the `bool experimental_allow_mempurge` and `experimental_mempurge_policy` flags have been replaced by a `double experimental_mempurge_threshold` option. This change of API reflects another major change introduced in this PR: the MemPurgeDecider() function now works by sampling the memtables being flushed to estimate the overall amount of useful payload (payload minus the garbage), and then compare this useful payload estimate with the `double experimental_mempurge_threshold` value. Therefore, when the value of this flag is `0.0` (default value), mempurge is simply deactivated. On the other hand, a value of `DBL_MAX` would be equivalent to always going through a mempurge regardless of the garbage ratio estimate. At the moment, a `double experimental_mempurge_threshold` value else than 0.0 or `DBL_MAX` is opnly supported`with the `SkipList` memtable representation. Regarding the sampling, this PR includes the introduction of a `MemTable::UniqueRandomSample` function that collects (approximately) random entries from the memtable by using the new `SkipList::Iterator::RandomSeek()` under the hood, or by iterating through each memtable entry, depending on the target sample size and the total number of entries. The unit tests have been readapted to support this new API. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8628 Reviewed By: pdillinger Differential Revision: D30149315 Pulled By: bjlemaire fbshipit-source-id: 1feef5390c95db6f4480ab4434716533d3947f27
2021-08-11 01:07:48 +00:00
std::unordered_set<const char*>* entries) override {
entries->clear();
// Avoid divide-by-0.
assert(target_sample_size > 0);
assert(num_entries > 0);
// NOTE: the size of entries is not enforced to be exactly
// target_sample_size at the end of this function, it might be slightly
// greater or smaller.
SkipListRep::Iterator iter(&skip_list_);
// There are two methods to create the subset of samples (size m)
// from the table containing N elements:
// 1-Iterate linearly through the N memtable entries. For each entry i,
// add it to the sample set with a probability
// (target_sample_size - entries.size() ) / (N-i).
//
// 2-Pick m random elements without repetition.
// We pick Option 2 when m<sqrt(N) and
// Option 1 when m > sqrt(N).
if (target_sample_size >
static_cast<uint64_t>(std::sqrt(1.0 * num_entries))) {
Random* rnd = Random::GetTLSInstance();
iter.SeekToFirst();
uint64_t counter = 0, num_samples_left = target_sample_size;
for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) {
// Add entry to sample set with probability
// num_samples_left/(num_entries - counter).
if (rnd->Next() % (num_entries - counter) < num_samples_left) {
entries->insert(iter.key());
num_samples_left--;
}
}
} else {
// Option 2: pick m random elements with no duplicates.
// If Option 2 is picked, then target_sample_size<sqrt(N)
// Using a set spares the need to check for duplicates.
for (uint64_t i = 0; i < target_sample_size; i++) {
// We give it 5 attempts to find a non-duplicate
// With 5 attempts, the chances of returning `entries` set
// of size target_sample_size is:
// PROD_{i=1}^{target_sample_size-1} [1-(i/N)^5]
// which is monotonically increasing with N in the worse case
// of target_sample_size=sqrt(N), and is always >99.9% for N>4.
// At worst, for the final pick , when m=sqrt(N) there is
// a probability of p= 1/sqrt(N) chances to find a duplicate.
for (uint64_t j = 0; j < 5; j++) {
iter.RandomSeek();
// unordered_set::insert returns pair<iterator, bool>.
// The second element is true if an insert successfully happened.
// If element is already in the set, this bool will be false, and
// true otherwise.
if ((entries->insert(iter.key())).second) {
break;
}
}
}
}
}
~SkipListRep() override = default;
// Iteration over the contents of a skip list
class Iterator : public MemTableRep::Iterator {
2015-11-19 22:24:29 +00:00
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
public:
// Initialize an iterator over the specified list.
// The returned iterator is not valid.
explicit Iterator(
2015-11-19 22:24:29 +00:00
const InlineSkipList<const MemTableRep::KeyComparator&>* list)
: iter_(list) {}
~Iterator() override = default;
// Returns true iff the iterator is positioned at a valid node.
bool Valid() const override { return iter_.Valid(); }
// Returns the key at the current position.
// REQUIRES: Valid()
Add an option to verify memtable key order during reads (#12889) Summary: add a new CF option `paranoid_memory_checks` that allows additional data integrity validations during read/scan. Currently, skiplist-based memtable will validate the order of keys visited. Further data validation can be added in different layers. The option will be opt-in due to performance overhead. The motivation for this feature is for services where data correctness is critical and want to detect in-memory corruption earlier. For a corrupted memtable key, this feature can help to detect it during during reads instead of during flush with existing protections (OutputValidator that verifies key order or per kv checksum). See internally linked task for more context. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12889 Test Plan: * new unit test added for paranoid_memory_checks=true. * existing unit test for paranoid_memory_checks=false. * enable in stress test. Performance Benchmark: we check for performance regression in read path where data is in memtable only. For each benchmark, the script was run at the same time for main and this PR: * Memtable-only randomread ops/sec: ``` (for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000 --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; Main: 608146 PR with paranoid_memory_checks=false: 607727 (- %0.07) PR with paranoid_memory_checks=true: 521889 (-%14.2) ``` * Memtable-only sequential scan ops/sec: ``` (for I in $(seq 1 50); do ./db_bench--benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 9180077 PR with paranoid_memory_checks=false: 9536241 (+%3.8) PR with paranoid_memory_checks=true: 7653934 (-%16.6) ``` * Memtable-only reverse scan ops/sec: ``` (for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 1285719 PR with integrity_checks=false: 1431626 (+%11.3) PR with integrity_checks=true: 811031 (-%36.9) ``` The `readrandom` benchmark shows no regression. The scanning benchmarks show improvement that I can't explain. Reviewed By: pdillinger Differential Revision: D60414267 Pulled By: cbi42 fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91
2024-08-19 20:53:25 +00:00
const char* key() const override {
assert(Valid());
return iter_.key();
}
// Advances to the next position.
// REQUIRES: Valid()
Add an option to verify memtable key order during reads (#12889) Summary: add a new CF option `paranoid_memory_checks` that allows additional data integrity validations during read/scan. Currently, skiplist-based memtable will validate the order of keys visited. Further data validation can be added in different layers. The option will be opt-in due to performance overhead. The motivation for this feature is for services where data correctness is critical and want to detect in-memory corruption earlier. For a corrupted memtable key, this feature can help to detect it during during reads instead of during flush with existing protections (OutputValidator that verifies key order or per kv checksum). See internally linked task for more context. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12889 Test Plan: * new unit test added for paranoid_memory_checks=true. * existing unit test for paranoid_memory_checks=false. * enable in stress test. Performance Benchmark: we check for performance regression in read path where data is in memtable only. For each benchmark, the script was run at the same time for main and this PR: * Memtable-only randomread ops/sec: ``` (for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000 --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; Main: 608146 PR with paranoid_memory_checks=false: 607727 (- %0.07) PR with paranoid_memory_checks=true: 521889 (-%14.2) ``` * Memtable-only sequential scan ops/sec: ``` (for I in $(seq 1 50); do ./db_bench--benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 9180077 PR with paranoid_memory_checks=false: 9536241 (+%3.8) PR with paranoid_memory_checks=true: 7653934 (-%16.6) ``` * Memtable-only reverse scan ops/sec: ``` (for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 1285719 PR with integrity_checks=false: 1431626 (+%11.3) PR with integrity_checks=true: 811031 (-%36.9) ``` The `readrandom` benchmark shows no regression. The scanning benchmarks show improvement that I can't explain. Reviewed By: pdillinger Differential Revision: D60414267 Pulled By: cbi42 fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91
2024-08-19 20:53:25 +00:00
void Next() override {
assert(Valid());
iter_.Next();
}
// Advances to the previous position.
// REQUIRES: Valid()
Add an option to verify memtable key order during reads (#12889) Summary: add a new CF option `paranoid_memory_checks` that allows additional data integrity validations during read/scan. Currently, skiplist-based memtable will validate the order of keys visited. Further data validation can be added in different layers. The option will be opt-in due to performance overhead. The motivation for this feature is for services where data correctness is critical and want to detect in-memory corruption earlier. For a corrupted memtable key, this feature can help to detect it during during reads instead of during flush with existing protections (OutputValidator that verifies key order or per kv checksum). See internally linked task for more context. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12889 Test Plan: * new unit test added for paranoid_memory_checks=true. * existing unit test for paranoid_memory_checks=false. * enable in stress test. Performance Benchmark: we check for performance regression in read path where data is in memtable only. For each benchmark, the script was run at the same time for main and this PR: * Memtable-only randomread ops/sec: ``` (for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000 --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; Main: 608146 PR with paranoid_memory_checks=false: 607727 (- %0.07) PR with paranoid_memory_checks=true: 521889 (-%14.2) ``` * Memtable-only sequential scan ops/sec: ``` (for I in $(seq 1 50); do ./db_bench--benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 9180077 PR with paranoid_memory_checks=false: 9536241 (+%3.8) PR with paranoid_memory_checks=true: 7653934 (-%16.6) ``` * Memtable-only reverse scan ops/sec: ``` (for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 1285719 PR with integrity_checks=false: 1431626 (+%11.3) PR with integrity_checks=true: 811031 (-%36.9) ``` The `readrandom` benchmark shows no regression. The scanning benchmarks show improvement that I can't explain. Reviewed By: pdillinger Differential Revision: D60414267 Pulled By: cbi42 fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91
2024-08-19 20:53:25 +00:00
void Prev() override {
assert(Valid());
iter_.Prev();
}
// Advance to the first entry with a key >= target
void Seek(const Slice& user_key, const char* memtable_key) override {
if (memtable_key != nullptr) {
iter_.Seek(memtable_key);
} else {
iter_.Seek(EncodeKey(&tmp_, user_key));
}
}
// Retreat to the last entry with a key <= target
void SeekForPrev(const Slice& user_key, const char* memtable_key) override {
if (memtable_key != nullptr) {
iter_.SeekForPrev(memtable_key);
} else {
iter_.SeekForPrev(EncodeKey(&tmp_, user_key));
}
}
Memtable sampling for mempurge heuristic. (#8628) Summary: Changes the API of the MemPurge process: the `bool experimental_allow_mempurge` and `experimental_mempurge_policy` flags have been replaced by a `double experimental_mempurge_threshold` option. This change of API reflects another major change introduced in this PR: the MemPurgeDecider() function now works by sampling the memtables being flushed to estimate the overall amount of useful payload (payload minus the garbage), and then compare this useful payload estimate with the `double experimental_mempurge_threshold` value. Therefore, when the value of this flag is `0.0` (default value), mempurge is simply deactivated. On the other hand, a value of `DBL_MAX` would be equivalent to always going through a mempurge regardless of the garbage ratio estimate. At the moment, a `double experimental_mempurge_threshold` value else than 0.0 or `DBL_MAX` is opnly supported`with the `SkipList` memtable representation. Regarding the sampling, this PR includes the introduction of a `MemTable::UniqueRandomSample` function that collects (approximately) random entries from the memtable by using the new `SkipList::Iterator::RandomSeek()` under the hood, or by iterating through each memtable entry, depending on the target sample size and the total number of entries. The unit tests have been readapted to support this new API. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8628 Reviewed By: pdillinger Differential Revision: D30149315 Pulled By: bjlemaire fbshipit-source-id: 1feef5390c95db6f4480ab4434716533d3947f27
2021-08-11 01:07:48 +00:00
void RandomSeek() override { iter_.RandomSeek(); }
// Position at the first entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToFirst() override { iter_.SeekToFirst(); }
// Position at the last entry in list.
// Final state of iterator is Valid() iff list is not empty.
void SeekToLast() override { iter_.SeekToLast(); }
Add an option to verify memtable key order during reads (#12889) Summary: add a new CF option `paranoid_memory_checks` that allows additional data integrity validations during read/scan. Currently, skiplist-based memtable will validate the order of keys visited. Further data validation can be added in different layers. The option will be opt-in due to performance overhead. The motivation for this feature is for services where data correctness is critical and want to detect in-memory corruption earlier. For a corrupted memtable key, this feature can help to detect it during during reads instead of during flush with existing protections (OutputValidator that verifies key order or per kv checksum). See internally linked task for more context. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12889 Test Plan: * new unit test added for paranoid_memory_checks=true. * existing unit test for paranoid_memory_checks=false. * enable in stress test. Performance Benchmark: we check for performance regression in read path where data is in memtable only. For each benchmark, the script was run at the same time for main and this PR: * Memtable-only randomread ops/sec: ``` (for I in $(seq 1 50);do ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --num=250000 --reads=500000 --seed=1723056275 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; Main: 608146 PR with paranoid_memory_checks=false: 607727 (- %0.07) PR with paranoid_memory_checks=true: 521889 (-%14.2) ``` * Memtable-only sequential scan ops/sec: ``` (for I in $(seq 1 50); do ./db_bench--benchmarks=fillseq,readseq[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 9180077 PR with paranoid_memory_checks=false: 9536241 (+%3.8) PR with paranoid_memory_checks=true: 7653934 (-%16.6) ``` * Memtable-only reverse scan ops/sec: ``` (for I in $(seq 1 20); do ./db_bench --benchmarks=fillseq,readreverse[-X10] --write_buffer_size=268435456 --num=1000000 --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }'; Main: 1285719 PR with integrity_checks=false: 1431626 (+%11.3) PR with integrity_checks=true: 811031 (-%36.9) ``` The `readrandom` benchmark shows no regression. The scanning benchmarks show improvement that I can't explain. Reviewed By: pdillinger Differential Revision: D60414267 Pulled By: cbi42 fbshipit-source-id: a70b0cbeea131f1a249a5f78f9dc3a62dacfaa91
2024-08-19 20:53:25 +00:00
Status NextAndValidate(bool allow_data_in_errors) override {
assert(Valid());
return iter_.NextAndValidate(allow_data_in_errors);
}
Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
bool allow_data_in_errors) override {
if (memtable_key != nullptr) {
return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
} else {
return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
allow_data_in_errors);
}
}
Status PrevAndValidate(bool allow_data_in_error) override {
assert(Valid());
return iter_.PrevAndValidate(allow_data_in_error);
}
protected:
std::string tmp_; // For passing to EncodeKey
};
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
// Iterator over the contents of a skip list which also keeps track of the
// previously visited node. In Seek(), it examines a few nodes after it
// first, falling back to O(log n) search from the head of the list only if
// the target key hasn't been found.
class LookaheadIterator : public MemTableRep::Iterator {
public:
explicit LookaheadIterator(const SkipListRep& rep)
: rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {}
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
~LookaheadIterator() override = default;
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
bool Valid() const override { return iter_.Valid(); }
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
const char* key() const override {
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
assert(Valid());
return iter_.key();
}
void Next() override {
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
assert(Valid());
bool advance_prev = true;
if (prev_.Valid()) {
auto k1 = rep_.UserKey(prev_.key());
auto k2 = rep_.UserKey(iter_.key());
if (k1.compare(k2) == 0) {
// same user key, don't move prev_
advance_prev = false;
} else if (rep_.transform_) {
// only advance prev_ if it has the same prefix as iter_
auto t1 = rep_.transform_->Transform(k1);
auto t2 = rep_.transform_->Transform(k2);
advance_prev = t1.compare(t2) == 0;
}
}
if (advance_prev) {
prev_ = iter_;
}
iter_.Next();
}
void Prev() override {
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
assert(Valid());
iter_.Prev();
prev_ = iter_;
}
void Seek(const Slice& internal_key, const char* memtable_key) override {
const char* encoded_key = (memtable_key != nullptr)
? memtable_key
: EncodeKey(&tmp_, internal_key);
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) {
// prev_.key() is smaller or equal to our target key; do a quick
// linear search (at most lookahead_ steps) starting from prev_
iter_ = prev_;
size_t cur = 0;
while (cur++ <= rep_.lookahead_ && iter_.Valid()) {
if (rep_.cmp_(encoded_key, iter_.key()) <= 0) {
return;
}
Next();
}
}
iter_.Seek(encoded_key);
prev_ = iter_;
}
void SeekForPrev(const Slice& internal_key,
const char* memtable_key) override {
const char* encoded_key = (memtable_key != nullptr)
? memtable_key
: EncodeKey(&tmp_, internal_key);
iter_.SeekForPrev(encoded_key);
prev_ = iter_;
}
void SeekToFirst() override {
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
iter_.SeekToFirst();
prev_ = iter_;
}
void SeekToLast() override {
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
iter_.SeekToLast();
prev_ = iter_;
}
protected:
std::string tmp_; // For passing to EncodeKey
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
private:
const SkipListRep& rep_;
2015-11-19 22:24:29 +00:00
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator prev_;
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
};
MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
if (lookahead_ > 0) {
void* mem =
arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator))
:
operator new(sizeof(SkipListRep::LookaheadIterator));
SkipListRep::LookaheadIterator Summary: This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an optimization for the tailing use case which includes many seeks. E.g. consider the following operations on a skip list iterator: Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ... If `lookahead` is positive, `SkipListRep` will return an iterator which also keeps track of the previously visited node. Seek() then first does a linear search starting from that node (up to `lookahead` steps). As in the tailing example above, this may require fewer than ~log(n) comparisons as with regular skip list search. Test Plan: Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It first writes N records (with consecutive keys), then measures how much time it takes to read them by calling `Seek()` and `Next()`. $ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \ -key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \ -seekseq_next 2 -skip_list_lookahead=0 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.389 micros/op 2569047 ops/sec; real 0m21.806s user 0m12.106s sys 0m9.672s $ time ./db_bench [...] -skip_list_lookahead=2 [...] DB path: [/dev/shm/rocksdbtest/dbbench] fillseekseq : 0.153 micros/op 6540684 ops/sec; real 0m19.469s user 0m10.192s sys 0m9.252s Reviewers: ljin, sdong, igor Reviewed By: igor Subscribers: dhruba, leveldb, march, lovro Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
return new (mem) SkipListRep::LookaheadIterator(*this);
} else {
void* mem = arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator))
:
operator new(sizeof(SkipListRep::Iterator));
return new (mem) SkipListRep::Iterator(&skip_list_);
}
}
};
} // namespace
static std::unordered_map<std::string, OptionTypeInfo> skiplist_factory_info = {
{"lookahead",
{0, OptionType::kSizeT, OptionVerificationType::kNormal,
OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}},
};
SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) {
RegisterOptions("SkipListFactoryOptions", &lookahead_,
&skiplist_factory_info);
}
std::string SkipListFactory::GetId() const {
std::string id = Name();
if (lookahead_ > 0) {
id.append(":").append(std::to_string(lookahead_));
}
return id;
}
MemTableRep* SkipListFactory::CreateMemTableRep(
const MemTableRep::KeyComparator& compare, Allocator* allocator,
const SliceTransform* transform, Logger* /*logger*/) {
return new SkipListRep(compare, allocator, transform, lookahead_);
}
} // namespace ROCKSDB_NAMESPACE