2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2021-08-11 01:07:48 +00:00
|
|
|
#include <random>
|
|
|
|
|
2013-07-23 21:42:27 +00:00
|
|
|
#include "db/memtable.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "memory/arena.h"
|
|
|
|
#include "memtable/inlineskiplist.h"
|
InlineSkipList part 3/3 - new skiplist type that colocates key and node
Summary:
This diff completes the creation of InlineSkipList<Cmp>, which is like
SkipList<const char*, Cmp> but it always allocates the key contiguously
with the node. This allows us to remove the pointer from the node
to the key. As a result the memory usage of the skip list is reduced
(by 1 to sizeof(void*) bytes depending on the padding required to align
the key storage), cache locality is improved, and we halve the number
of calls to the allocator.
For skip lists whose keys are freshly-allocated const char*,
InlineSkipList is strictly preferable to SkipList. This diff doesn't
replace SkipList, however, because some of the use cases of SkipList in
RocksDB are either character sequences that are not allocated at the
same time as the skip list node allocation (for example
hash_linklist_rep) or have different key types (for example
write_batch_with_index). Taking advantage of inline allocation for
those cases is left to future work.
The perf win is biggest for small values. For single-threaded CPU-bound
(32M fillrandom operations with no WAL log) with 16 byte keys and 0 byte
values, the db_bench perf goes from ~310k ops/sec to ~410k ops/sec. For
large values the improvement is less pronounced, but seems to be between
5% and 10% on the same configuration.
Test Plan: make check
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D51123
2015-11-19 22:24:29 +00:00
|
|
|
#include "rocksdb/memtablerep.h"
|
2021-09-08 14:45:59 +00:00
|
|
|
#include "rocksdb/utilities/options_type.h"
|
|
|
|
#include "util/string_util.h"
|
2013-07-23 21:42:27 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2013-08-23 06:10:02 +00:00
|
|
|
namespace {
|
2013-07-23 21:42:27 +00:00
|
|
|
class SkipListRep : public MemTableRep {
|
InlineSkipList part 3/3 - new skiplist type that colocates key and node
Summary:
This diff completes the creation of InlineSkipList<Cmp>, which is like
SkipList<const char*, Cmp> but it always allocates the key contiguously
with the node. This allows us to remove the pointer from the node
to the key. As a result the memory usage of the skip list is reduced
(by 1 to sizeof(void*) bytes depending on the padding required to align
the key storage), cache locality is improved, and we halve the number
of calls to the allocator.
For skip lists whose keys are freshly-allocated const char*,
InlineSkipList is strictly preferable to SkipList. This diff doesn't
replace SkipList, however, because some of the use cases of SkipList in
RocksDB are either character sequences that are not allocated at the
same time as the skip list node allocation (for example
hash_linklist_rep) or have different key types (for example
write_batch_with_index). Taking advantage of inline allocation for
those cases is left to future work.
The perf win is biggest for small values. For single-threaded CPU-bound
(32M fillrandom operations with no WAL log) with 16 byte keys and 0 byte
values, the db_bench perf goes from ~310k ops/sec to ~410k ops/sec. For
large values the improvement is less pronounced, but seems to be between
5% and 10% on the same configuration.
Test Plan: make check
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D51123
2015-11-19 22:24:29 +00:00
|
|
|
InlineSkipList<const MemTableRep::KeyComparator&> skip_list_;
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
const MemTableRep::KeyComparator& cmp_;
|
|
|
|
const SliceTransform* transform_;
|
|
|
|
const size_t lookahead_;
|
|
|
|
|
|
|
|
friend class LookaheadIterator;
|
2022-10-28 20:16:50 +00:00
|
|
|
|
|
|
|
public:
|
|
|
|
// Builds a skip-list backed memtable representation.
//   compare   - key comparator; handed to the skip list and kept in cmp_.
//   allocator - passed to the MemTableRep base and to the skip list.
//   transform - stored in transform_ as given.
//   lookahead - stored in lookahead_ as given.
explicit SkipListRep(const MemTableRep::KeyComparator& compare,
                     Allocator* allocator,
                     const SliceTransform* transform,
                     const size_t lookahead)
    : MemTableRep(allocator),
      skip_list_(compare, allocator),
      cmp_(compare),
      transform_(transform),
      lookahead_(lookahead) {
}
|
|
|
|
|
|
|
|
// Requests a key buffer of `len` bytes from the skip list (AllocateKey),
// publishes its address through `*buf`, and returns that same address as an
// opaque KeyHandle.
KeyHandle Allocate(const size_t len, char** buf) override {
  char* const key_buf = skip_list_.AllocateKey(len);
  *buf = key_buf;
  return static_cast<KeyHandle>(key_buf);
}
|
InlineSkipList part 3/3 - new skiplist type that colocates key and node
Summary:
This diff completes the creation of InlineSkipList<Cmp>, which is like
SkipList<const char*, Cmp> but it always allocates the key contiguously
with the node. This allows us to remove the pointer from the node
to the key. As a result the memory usage of the skip list is reduced
(by 1 to sizeof(void*) bytes depending on the padding required to align
the key storage), cache locality is improved, and we halve the number
of calls to the allocator.
For skip lists whose keys are freshly-allocated const char*,
InlineSkipList is strictly preferable to SkipList. This diff doesn't
replace SkipList, however, because some of the use cases of SkipList in
RocksDB are either character sequences that are not allocated at the
same time as the skip list node allocation (for example
hash_linklist_rep) or have different key types (for example
write_batch_with_index). Taking advantage of inline allocation for
those cases is left to future work.
The perf win is biggest for small values. For single-threaded CPU-bound
(32M fillrandom operations with no WAL log) with 16 byte keys and 0 byte
values, the db_bench perf goes from ~310k ops/sec to ~410k ops/sec. For
large values the improvement is less pronounced, but seems to be between
5% and 10% on the same configuration.
Test Plan: make check
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D51123
2015-11-19 22:24:29 +00:00
|
|
|
|
2013-07-23 21:42:27 +00:00
|
|
|
// Insert key into the list.
|
|
|
|
// REQUIRES: nothing that compares equal to key is currently in the list.
|
2022-10-28 20:16:50 +00:00
|
|
|
void Insert(KeyHandle handle) override {
|
|
|
|
skip_list_.Insert(static_cast<char*>(handle));
|
|
|
|
}
|
2018-02-16 01:12:48 +00:00
|
|
|
|
2022-10-28 20:16:50 +00:00
|
|
|
// Adds the key referenced by `handle` to the skip list and returns the
// boolean produced by skip_list_.Insert().
// NOTE(review): presumably false signals that an equal key already exists --
// confirm against InlineSkipList::Insert.
bool InsertKey(KeyHandle handle) override {
  char* const key = static_cast<char*>(handle);
  const bool result = skip_list_.Insert(key);
  return result;
}
|
2013-07-23 21:42:27 +00:00
|
|
|
|
2022-10-28 20:16:50 +00:00
|
|
|
// Forwards the key referenced by `handle`, together with the caller-owned
// `hint` slot, to skip_list_.InsertWithHint(). The result is discarded.
void InsertWithHint(KeyHandle handle, void** hint) override {
  char* const key = static_cast<char*>(handle);
  skip_list_.InsertWithHint(key, hint);
}
|
2018-02-16 01:12:48 +00:00
|
|
|
|
2022-10-28 20:16:50 +00:00
|
|
|
// Forwards the key referenced by `handle`, together with the caller-owned
// `hint` slot, to skip_list_.InsertWithHint() and returns its boolean result.
bool InsertKeyWithHint(KeyHandle handle, void** hint) override {
  char* const key = static_cast<char*>(handle);
  const bool result = skip_list_.InsertWithHint(key, hint);
  return result;
}
|
2016-11-14 02:58:17 +00:00
|
|
|
|
2022-10-28 20:16:50 +00:00
|
|
|
// Forwards the key referenced by `handle` and the `hint` slot to
// skip_list_.InsertWithHintConcurrently(). The result is discarded.
void InsertWithHintConcurrently(KeyHandle handle, void** hint) override {
  char* const key = static_cast<char*>(handle);
  skip_list_.InsertWithHintConcurrently(key, hint);
}
|
2019-09-12 23:53:31 +00:00
|
|
|
|
2022-10-28 20:16:50 +00:00
|
|
|
// Forwards the key referenced by `handle` and the `hint` slot to
// skip_list_.InsertWithHintConcurrently() and returns its boolean result.
bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override {
  char* const key = static_cast<char*>(handle);
  const bool result = skip_list_.InsertWithHintConcurrently(key, hint);
  return result;
}
|
2019-09-12 23:53:31 +00:00
|
|
|
|
2022-10-28 20:16:50 +00:00
|
|
|
// Forwards the key referenced by `handle` to
// skip_list_.InsertConcurrently(). The result is discarded.
void InsertConcurrently(KeyHandle handle) override {
  char* const key = static_cast<char*>(handle);
  skip_list_.InsertConcurrently(key);
}
|
2018-02-16 01:12:48 +00:00
|
|
|
|
2022-10-28 20:16:50 +00:00
|
|
|
// Forwards the key referenced by `handle` to
// skip_list_.InsertConcurrently() and returns its boolean result.
bool InsertKeyConcurrently(KeyHandle handle) override {
  char* const key = static_cast<char*>(handle);
  const bool result = skip_list_.InsertConcurrently(key);
  return result;
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
|
2013-07-23 21:42:27 +00:00
|
|
|
// Returns true iff an entry that compares equal to key is in the list.
|
2022-10-28 20:16:50 +00:00
|
|
|
bool Contains(const char* key) const override {
|
|
|
|
return skip_list_.Contains(key);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Always returns 0: all memory is allocated through the allocator, so there
// is nothing to report from this representation itself.
size_t ApproximateMemoryUsage() override {
  const size_t untracked_bytes = 0;
  return untracked_bytes;
}
|
|
|
|
|
|
|
|
void Get(const LookupKey& k, void* callback_args,
|
|
|
|
bool (*callback_func)(void* arg, const char* entry)) override {
|
|
|
|
SkipListRep::Iterator iter(&skip_list_);
|
|
|
|
Slice dummy_slice;
|
|
|
|
for (iter.Seek(dummy_slice, k.memtable_key().data());
|
|
|
|
iter.Valid() && callback_func(callback_args, iter.key());
|
|
|
|
iter.Next()) {
|
|
|
|
}
|
|
|
|
}
|
2014-02-11 17:46:30 +00:00
|
|
|
|
2024-08-19 20:53:25 +00:00
|
|
|
// Validating counterpart of Get(): positions with SeekAndValidate() and
// advances with NextAndValidate(), passing `allow_data_in_errors` to both.
// Entries are handed to `callback_func(callback_args, entry)` until the
// iterator becomes invalid, a step returns a non-OK status, or the callback
// returns false. Returns the status of the last seek/advance step performed.
Status GetAndValidate(const LookupKey& k, void* callback_args,
                      bool (*callback_func)(void* arg, const char* entry),
                      bool allow_data_in_errors) override {
  SkipListRep::Iterator iter(&skip_list_);
  // Placeholder user key; SeekAndValidate() positions by the memtable key
  // when that pointer is non-null.
  Slice dummy_slice;
  Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(),
                                       allow_data_in_errors);
  while (iter.Valid() && status.ok() &&
         callback_func(callback_args, iter.key())) {
    status = iter.NextAndValidate(allow_data_in_errors);
  }
  return status;
}
|
|
|
|
|
2015-06-13 01:04:30 +00:00
|
|
|
uint64_t ApproximateNumEntries(const Slice& start_ikey,
|
|
|
|
const Slice& end_ikey) override {
|
Re-implement GetApproximateMemTableStats for skip lists (#13047)
Summary:
GetApproximateMemTableStats() could return some bad results with the standard skip list memtable. See this new db_bench test showing the dismal distribution of results when the actual number of entries in range is 1000:
```
$ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=1000
...
filluniquerandom : 1.391 micros/op 718915 ops/sec 1.391 seconds 1000000 operations; 11.7 MB/s
approximatememtablestats : 3.711 micros/op 269492 ops/sec 3.711 seconds 1000000 operations;
Reported entry count stats (expected 1000):
Count: 1000000 Average: 2344.1611 StdDev: 26587.27
Min: 0 Median: 965.8555 Max: 835273
Percentiles: P50: 965.86 P75: 1610.77 P99: 12618.01 P99.9: 74991.58 P99.99: 830970.97
------------------------------------------------------
[ 0, 1 ] 131344 13.134% 13.134% ###
( 1, 2 ] 115 0.011% 13.146%
( 2, 3 ] 106 0.011% 13.157%
( 3, 4 ] 190 0.019% 13.176%
( 4, 6 ] 214 0.021% 13.197%
( 6, 10 ] 522 0.052% 13.249%
( 10, 15 ] 748 0.075% 13.324%
( 15, 22 ] 1002 0.100% 13.424%
( 22, 34 ] 1948 0.195% 13.619%
( 34, 51 ] 3067 0.307% 13.926%
( 51, 76 ] 4213 0.421% 14.347%
( 76, 110 ] 5721 0.572% 14.919%
( 110, 170 ] 11375 1.137% 16.056%
( 170, 250 ] 17928 1.793% 17.849%
( 250, 380 ] 36597 3.660% 21.509% #
( 380, 580 ] 77882 7.788% 29.297% ##
( 580, 870 ] 160193 16.019% 45.317% ###
( 870, 1300 ] 210098 21.010% 66.326% ####
( 1300, 1900 ] 167461 16.746% 83.072% ###
( 1900, 2900 ] 78678 7.868% 90.940% ##
( 2900, 4400 ] 47743 4.774% 95.715% #
( 4400, 6600 ] 17650 1.765% 97.480%
( 6600, 9900 ] 11895 1.190% 98.669%
( 9900, 14000 ] 4993 0.499% 99.168%
( 14000, 22000 ] 2384 0.238% 99.407%
( 22000, 33000 ] 1966 0.197% 99.603%
( 50000, 75000 ] 2968 0.297% 99.900%
( 570000, 860000 ] 999 0.100% 100.000%
readrandom : 1.967 micros/op 508487 ops/sec 1.967 seconds 1000000 operations; 8.2 MB/s (1000000 of 1000000 found)
```
Perhaps the only good thing to say about the old implementation was that it was fast, though apparently not that fast.
I've implemented a much more robust and reasonably fast new version of the function. It's still logarithmic but with some larger constant factors. The standard deviation from true count is around 20% or less, and roughly the CPU cost of two memtable point look-ups. See code comments for detail.
```
$ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=1000
...
filluniquerandom : 1.478 micros/op 676434 ops/sec 1.478 seconds 1000000 operations; 11.0 MB/s
approximatememtablestats : 2.694 micros/op 371157 ops/sec 2.694 seconds 1000000 operations;
Reported entry count stats (expected 1000):
Count: 1000000 Average: 1073.5158 StdDev: 197.80
Min: 608 Median: 1079.9506 Max: 2176
Percentiles: P50: 1079.95 P75: 1223.69 P99: 1852.36 P99.9: 1898.70 P99.99: 2176.00
------------------------------------------------------
( 580, 870 ] 134848 13.485% 13.485% ###
( 870, 1300 ] 747868 74.787% 88.272% ###############
( 1300, 1900 ] 116536 11.654% 99.925% ##
( 1900, 2900 ] 748 0.075% 100.000%
readrandom : 1.997 micros/op 500654 ops/sec 1.997 seconds 1000000 operations; 8.1 MB/s (1000000 of 1000000 found)
```
We can already see that the distribution of results is dramatically better and wonderfully normal-looking, with relative standard deviation around 20%. The function is also FASTER, at least with these parameters. Let's look how this behavior generalizes, first *much* larger range:
```
$ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=30000
filluniquerandom : 1.390 micros/op 719654 ops/sec 1.376 seconds 990000 operations; 11.7 MB/s
approximatememtablestats : 1.129 micros/op 885649 ops/sec 1.129 seconds 1000000 operations;
Reported entry count stats (expected 30000):
Count: 1000000 Average: 31098.8795 StdDev: 3601.47
Min: 21504 Median: 29333.9303 Max: 43008
Percentiles: P50: 29333.93 P75: 33018.00 P99: 43008.00 P99.9: 43008.00 P99.99: 43008.00
------------------------------------------------------
( 14000, 22000 ] 408 0.041% 0.041%
( 22000, 33000 ] 749327 74.933% 74.974% ###############
( 33000, 50000 ] 250265 25.027% 100.000% #####
readrandom : 1.894 micros/op 528083 ops/sec 1.894 seconds 1000000 operations; 8.5 MB/s (989989 of 1000000 found)
```
This is *even faster* and relatively *more accurate*, with relative standard deviation closer to 10%. Code comments explain why. Now let's look at smaller ranges. Implementation quirks or conveniences:
* When actual number in range is >= 40, the minimum return value is 40.
* When the actual is <= 10, it is guaranteed to return that actual number.
```
$ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=75
...
filluniquerandom : 1.417 micros/op 705668 ops/sec 1.417 seconds 999975 operations; 11.4 MB/s
approximatememtablestats : 3.342 micros/op 299197 ops/sec 3.342 seconds 1000000 operations;
Reported entry count stats (expected 75):
Count: 1000000 Average: 75.1210 StdDev: 15.02
Min: 40 Median: 71.9395 Max: 256
Percentiles: P50: 71.94 P75: 89.69 P99: 119.12 P99.9: 166.68 P99.99: 229.78
------------------------------------------------------
( 34, 51 ] 38867 3.887% 3.887% #
( 51, 76 ] 550554 55.055% 58.942% ###########
( 76, 110 ] 398854 39.885% 98.828% ########
( 110, 170 ] 11353 1.135% 99.963%
( 170, 250 ] 364 0.036% 99.999%
( 250, 380 ] 8 0.001% 100.000%
readrandom : 1.861 micros/op 537224 ops/sec 1.861 seconds 1000000 operations; 8.7 MB/s (999974 of 1000000 found)
$ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=25
...
filluniquerandom : 1.501 micros/op 666283 ops/sec 1.501 seconds 1000000 operations; 10.8 MB/s
approximatememtablestats : 5.118 micros/op 195401 ops/sec 5.118 seconds 1000000 operations;
Reported entry count stats (expected 25):
Count: 1000000 Average: 26.2392 StdDev: 4.58
Min: 25 Median: 28.4590 Max: 72
Percentiles: P50: 28.46 P75: 31.69 P99: 49.27 P99.9: 67.95 P99.99: 72.00
------------------------------------------------------
( 22, 34 ] 928936 92.894% 92.894% ###################
( 34, 51 ] 67960 6.796% 99.690% #
( 51, 76 ] 3104 0.310% 100.000%
readrandom : 1.892 micros/op 528595 ops/sec 1.892 seconds 1000000 operations; 8.6 MB/s (1000000 of 1000000 found)
$ ./db_bench --benchmarks=filluniquerandom,approximatememtablestats,readrandom --value_size=1 --num=1000000 --batch_size=10
...
filluniquerandom : 1.642 micros/op 608916 ops/sec 1.642 seconds 1000000 operations; 9.9 MB/s
approximatememtablestats : 3.042 micros/op 328721 ops/sec 3.042 seconds 1000000 operations;
Reported entry count stats (expected 10):
Count: 1000000 Average: 10.0000 StdDev: 0.00
Min: 10 Median: 10.0000 Max: 10
Percentiles: P50: 10.00 P75: 10.00 P99: 10.00 P99.9: 10.00 P99.99: 10.00
------------------------------------------------------
( 6, 10 ] 1000000 100.000% 100.000% ####################
readrandom : 1.805 micros/op 554126 ops/sec 1.805 seconds 1000000 operations; 9.0 MB/s (1000000 of 1000000 found)
```
Remarkably consistent.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13047
Test Plan: new db_bench test for both performance and accuracy (see above); added to crash test; unit test updated.
Reviewed By: cbi42
Differential Revision: D63722003
Pulled By: pdillinger
fbshipit-source-id: cfc8613c085e87c17ecec22d82601aac2a5a1b26
2024-10-02 21:25:50 +00:00
|
|
|
return skip_list_.ApproximateNumEntries(start_ikey, end_ikey);
|
2015-06-13 01:04:30 +00:00
|
|
|
}
|
|
|
|
|
2021-08-13 21:34:43 +00:00
|
|
|
void UniqueRandomSample(const uint64_t num_entries,
|
|
|
|
const uint64_t target_sample_size,
|
2021-08-11 01:07:48 +00:00
|
|
|
std::unordered_set<const char*>* entries) override {
|
|
|
|
entries->clear();
|
|
|
|
// Avoid divide-by-0.
|
|
|
|
assert(target_sample_size > 0);
|
|
|
|
assert(num_entries > 0);
|
|
|
|
// NOTE: the size of entries is not enforced to be exactly
|
|
|
|
// target_sample_size at the end of this function, it might be slightly
|
|
|
|
// greater or smaller.
|
|
|
|
SkipListRep::Iterator iter(&skip_list_);
|
|
|
|
// There are two methods to create the subset of samples (size m)
|
|
|
|
// from the table containing N elements:
|
|
|
|
// 1-Iterate linearly through the N memtable entries. For each entry i,
|
|
|
|
// add it to the sample set with a probability
|
|
|
|
// (target_sample_size - entries.size() ) / (N-i).
|
|
|
|
//
|
|
|
|
// 2-Pick m random elements without repetition.
|
|
|
|
// We pick Option 2 when m<sqrt(N) and
|
|
|
|
// Option 1 when m > sqrt(N).
|
|
|
|
if (target_sample_size >
|
|
|
|
static_cast<uint64_t>(std::sqrt(1.0 * num_entries))) {
|
|
|
|
Random* rnd = Random::GetTLSInstance();
|
|
|
|
iter.SeekToFirst();
|
|
|
|
uint64_t counter = 0, num_samples_left = target_sample_size;
|
|
|
|
for (; iter.Valid() && (num_samples_left > 0); iter.Next(), counter++) {
|
|
|
|
// Add entry to sample set with probability
|
|
|
|
// num_samples_left/(num_entries - counter).
|
|
|
|
if (rnd->Next() % (num_entries - counter) < num_samples_left) {
|
|
|
|
entries->insert(iter.key());
|
|
|
|
num_samples_left--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Option 2: pick m random elements with no duplicates.
|
|
|
|
// If Option 2 is picked, then target_sample_size<sqrt(N)
|
|
|
|
// Using a set spares the need to check for duplicates.
|
|
|
|
for (uint64_t i = 0; i < target_sample_size; i++) {
|
|
|
|
// We give it 5 attempts to find a non-duplicate
|
|
|
|
// With 5 attempts, the chances of returning `entries` set
|
|
|
|
// of size target_sample_size is:
|
|
|
|
// PROD_{i=1}^{target_sample_size-1} [1-(i/N)^5]
|
|
|
|
// which is monotonically increasing with N in the worse case
|
|
|
|
// of target_sample_size=sqrt(N), and is always >99.9% for N>4.
|
|
|
|
// At worst, for the final pick , when m=sqrt(N) there is
|
|
|
|
// a probability of p= 1/sqrt(N) chances to find a duplicate.
|
|
|
|
for (uint64_t j = 0; j < 5; j++) {
|
|
|
|
iter.RandomSeek();
|
|
|
|
// unordered_set::insert returns pair<iterator, bool>.
|
|
|
|
// The second element is true if an insert successfully happened.
|
|
|
|
// If element is already in the set, this bool will be false, and
|
|
|
|
// true otherwise.
|
|
|
|
if ((entries->insert(iter.key())).second) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-01 19:10:30 +00:00
|
|
|
~SkipListRep() override = default;
|
2013-07-23 21:42:27 +00:00
|
|
|
|
|
|
|
// Iteration over the contents of a skip list
|
|
|
|
class Iterator : public MemTableRep::Iterator {
|
InlineSkipList part 3/3 - new skiplist type that colocates key and node
Summary:
This diff completes the creation of InlineSkipList<Cmp>, which is like
SkipList<const char*, Cmp> but it always allocates the key contiguously
with the node. This allows us to remove the pointer from the node
to the key. As a result the memory usage of the skip list is reduced
(by 1 to sizeof(void*) bytes depending on the padding required to align
the key storage), cache locality is improved, and we halve the number
of calls to the allocator.
For skip lists whose keys are freshly-allocated const char*,
InlineSkipList is strictly preferable to SkipList. This diff doesn't
replace SkipList, however, because some of the use cases of SkipList in
RocksDB are either character sequences that are not allocated at the
same time as the skip list node allocation (for example
hash_linklist_rep) or have different key types (for example
write_batch_with_index). Taking advantage of inline allocation for
those cases is left to future work.
The perf win is biggest for small values. For single-threaded CPU-bound
(32M fillrandom operations with no WAL log) with 16 byte keys and 0 byte
values, the db_bench perf goes from ~310k ops/sec to ~410k ops/sec. For
large values the improvement is less pronounced, but seems to be between
5% and 10% on the same configuration.
Test Plan: make check
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D51123
2015-11-19 22:24:29 +00:00
|
|
|
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
|
|
|
|
|
2013-07-23 21:42:27 +00:00
|
|
|
public:
|
|
|
|
// Initialize an iterator over the specified list.
|
|
|
|
// The returned iterator is not valid.
|
|
|
|
explicit Iterator(
|
InlineSkipList part 3/3 - new skiplist type that colocates key and node
Summary:
This diff completes the creation of InlineSkipList<Cmp>, which is like
SkipList<const char*, Cmp> but it always allocates the key contiguously
with the node. This allows us to remove the pointer from the node
to the key. As a result the memory usage of the skip list is reduced
(by 1 to sizeof(void*) bytes depending on the padding required to align
the key storage), cache locality is improved, and we halve the number
of calls to the allocator.
For skip lists whose keys are freshly-allocated const char*,
InlineSkipList is strictly preferable to SkipList. This diff doesn't
replace SkipList, however, because some of the use cases of SkipList in
RocksDB are either character sequences that are not allocated at the
same time as the skip list node allocation (for example
hash_linklist_rep) or have different key types (for example
write_batch_with_index). Taking advantage of inline allocation for
those cases is left to future work.
The perf win is biggest for small values. For single-threaded CPU-bound
(32M fillrandom operations with no WAL log) with 16 byte keys and 0 byte
values, the db_bench perf goes from ~310k ops/sec to ~410k ops/sec. For
large values the improvement is less pronounced, but seems to be between
5% and 10% on the same configuration.
Test Plan: make check
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D51123
2015-11-19 22:24:29 +00:00
|
|
|
const InlineSkipList<const MemTableRep::KeyComparator&>* list)
|
|
|
|
: iter_(list) {}
|
2013-07-23 21:42:27 +00:00
|
|
|
|
2023-12-01 19:10:30 +00:00
|
|
|
~Iterator() override = default;
|
2013-07-23 21:42:27 +00:00
|
|
|
|
|
|
|
// Returns true iff the iterator is positioned at a valid node.
|
2019-02-14 21:52:47 +00:00
|
|
|
bool Valid() const override { return iter_.Valid(); }
|
2013-07-23 21:42:27 +00:00
|
|
|
|
|
|
|
// Returns the key at the current position.
|
|
|
|
// REQUIRES: Valid()
|
2024-08-19 20:53:25 +00:00
|
|
|
const char* key() const override {
|
|
|
|
assert(Valid());
|
|
|
|
return iter_.key();
|
|
|
|
}
|
2013-07-23 21:42:27 +00:00
|
|
|
|
|
|
|
// Advances to the next position.
|
|
|
|
// REQUIRES: Valid()
|
2024-08-19 20:53:25 +00:00
|
|
|
void Next() override {
|
|
|
|
assert(Valid());
|
|
|
|
iter_.Next();
|
|
|
|
}
|
2013-07-23 21:42:27 +00:00
|
|
|
|
|
|
|
// Advances to the previous position.
|
|
|
|
// REQUIRES: Valid()
|
2024-08-19 20:53:25 +00:00
|
|
|
void Prev() override {
|
|
|
|
assert(Valid());
|
|
|
|
iter_.Prev();
|
|
|
|
}
|
2013-07-23 21:42:27 +00:00
|
|
|
|
|
|
|
// Advance to the first entry with a key >= target
|
2019-02-14 21:52:47 +00:00
|
|
|
void Seek(const Slice& user_key, const char* memtable_key) override {
|
2013-11-21 03:49:27 +00:00
|
|
|
if (memtable_key != nullptr) {
|
|
|
|
iter_.Seek(memtable_key);
|
|
|
|
} else {
|
|
|
|
iter_.Seek(EncodeKey(&tmp_, user_key));
|
|
|
|
}
|
2013-07-23 21:42:27 +00:00
|
|
|
}
|
|
|
|
|
2016-09-28 01:20:57 +00:00
|
|
|
// Retreat to the last entry with a key <= target
|
2019-02-14 21:52:47 +00:00
|
|
|
void SeekForPrev(const Slice& user_key, const char* memtable_key) override {
|
2016-09-28 01:20:57 +00:00
|
|
|
if (memtable_key != nullptr) {
|
|
|
|
iter_.SeekForPrev(memtable_key);
|
|
|
|
} else {
|
|
|
|
iter_.SeekForPrev(EncodeKey(&tmp_, user_key));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-11 01:07:48 +00:00
|
|
|
// Delegates to the wrapped skip-list iterator's RandomSeek().
void RandomSeek() override {
  iter_.RandomSeek();
}
|
|
|
|
|
2013-07-23 21:42:27 +00:00
|
|
|
// Position at the first entry in list.
|
|
|
|
// Final state of iterator is Valid() iff list is not empty.
|
2019-02-14 21:52:47 +00:00
|
|
|
void SeekToFirst() override { iter_.SeekToFirst(); }
|
2013-07-23 21:42:27 +00:00
|
|
|
|
|
|
|
// Position at the last entry in list.
|
|
|
|
// Final state of iterator is Valid() iff list is not empty.
|
2019-02-14 21:52:47 +00:00
|
|
|
void SeekToLast() override { iter_.SeekToLast(); }
|
|
|
|
|
2024-08-19 20:53:25 +00:00
|
|
|
// Steps forward through the wrapped iterator's NextAndValidate(), passing
// `allow_data_in_errors` along, and returns the Status it produces.
// REQUIRES: Valid()
Status NextAndValidate(bool allow_data_in_errors) override {
  assert(Valid());
  Status step_status = iter_.NextAndValidate(allow_data_in_errors);
  return step_status;
}
|
|
|
|
|
|
|
|
// Validating seek: delegates to the wrapped iterator's SeekAndValidate()
// and returns its Status. The target is `memtable_key` when it is non-null;
// otherwise `user_key` is encoded into the scratch string tmp_ via
// EncodeKey() and that encoding is used instead.
Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
                       bool allow_data_in_errors) override {
  if (memtable_key == nullptr) {
    return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
                                 allow_data_in_errors);
  }
  return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
}
|
|
|
|
|
|
|
|
// Steps backward through the wrapped iterator's PrevAndValidate(), passing
// `allow_data_in_error` along, and returns the Status it produces.
// REQUIRES: Valid()
Status PrevAndValidate(bool allow_data_in_error) override {
  assert(Valid());
  Status step_status = iter_.PrevAndValidate(allow_data_in_error);
  return step_status;
}
|
|
|
|
|
2013-11-21 03:49:27 +00:00
|
|
|
protected:
|
2022-10-28 20:16:50 +00:00
|
|
|
std::string tmp_; // For passing to EncodeKey
|
2013-07-23 21:42:27 +00:00
|
|
|
};
|
|
|
|
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
// Iterator over the contents of a skip list which also keeps track of the
|
|
|
|
// previously visited node. In Seek(), it examines a few nodes after it
|
|
|
|
// first, falling back to O(log n) search from the head of the list only if
|
|
|
|
// the target key hasn't been found.
|
|
|
|
class LookaheadIterator : public MemTableRep::Iterator {
|
|
|
|
public:
|
2022-10-28 20:16:50 +00:00
|
|
|
explicit LookaheadIterator(const SkipListRep& rep)
|
|
|
|
: rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {}
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
|
2023-12-01 19:10:30 +00:00
|
|
|
~LookaheadIterator() override = default;
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
bool Valid() const override { return iter_.Valid(); }
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
const char* key() const override {
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
assert(Valid());
|
|
|
|
return iter_.key();
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void Next() override {
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
assert(Valid());
|
|
|
|
|
|
|
|
bool advance_prev = true;
|
|
|
|
if (prev_.Valid()) {
|
|
|
|
auto k1 = rep_.UserKey(prev_.key());
|
|
|
|
auto k2 = rep_.UserKey(iter_.key());
|
|
|
|
|
|
|
|
if (k1.compare(k2) == 0) {
|
|
|
|
// same user key, don't move prev_
|
|
|
|
advance_prev = false;
|
|
|
|
} else if (rep_.transform_) {
|
|
|
|
// only advance prev_ if it has the same prefix as iter_
|
|
|
|
auto t1 = rep_.transform_->Transform(k1);
|
|
|
|
auto t2 = rep_.transform_->Transform(k2);
|
|
|
|
advance_prev = t1.compare(t2) == 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (advance_prev) {
|
|
|
|
prev_ = iter_;
|
|
|
|
}
|
|
|
|
iter_.Next();
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void Prev() override {
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
assert(Valid());
|
|
|
|
iter_.Prev();
|
|
|
|
prev_ = iter_;
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void Seek(const Slice& internal_key, const char* memtable_key) override {
|
2022-10-28 20:16:50 +00:00
|
|
|
const char* encoded_key = (memtable_key != nullptr)
|
|
|
|
? memtable_key
|
|
|
|
: EncodeKey(&tmp_, internal_key);
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
|
|
|
|
if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) {
|
|
|
|
// prev_.key() is smaller or equal to our target key; do a quick
|
|
|
|
// linear search (at most lookahead_ steps) starting from prev_
|
|
|
|
iter_ = prev_;
|
|
|
|
|
|
|
|
size_t cur = 0;
|
|
|
|
while (cur++ <= rep_.lookahead_ && iter_.Valid()) {
|
|
|
|
if (rep_.cmp_(encoded_key, iter_.key()) <= 0) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
Next();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
iter_.Seek(encoded_key);
|
|
|
|
prev_ = iter_;
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void SeekForPrev(const Slice& internal_key,
|
|
|
|
const char* memtable_key) override {
|
2016-09-28 01:20:57 +00:00
|
|
|
const char* encoded_key = (memtable_key != nullptr)
|
|
|
|
? memtable_key
|
|
|
|
: EncodeKey(&tmp_, internal_key);
|
|
|
|
iter_.SeekForPrev(encoded_key);
|
|
|
|
prev_ = iter_;
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void SeekToFirst() override {
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
iter_.SeekToFirst();
|
|
|
|
prev_ = iter_;
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void SeekToLast() override {
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
iter_.SeekToLast();
|
|
|
|
prev_ = iter_;
|
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
2022-10-28 20:16:50 +00:00
|
|
|
std::string tmp_; // For passing to EncodeKey
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
const SkipListRep& rep_;
|
InlineSkipList part 3/3 - new skiplist type that colocates key and node
Summary:
This diff completes the creation of InlineSkipList<Cmp>, which is like
SkipList<const char*, Cmp> but it always allocates the key contiguously
with the node. This allows us to remove the pointer from the node
to the key. As a result the memory usage of the skip list is reduced
(by 1 to sizeof(void*) bytes depending on the padding required to align
the key storage), cache locality is improved, and we halve the number
of calls to the allocator.
For skip lists whose keys are freshly-allocated const char*,
InlineSkipList is stricly preferrable to SkipList. This diff doesn't
replace SkipList, however, because some of the use cases of SkipList in
RocksDB are either character sequences that are not allocated at the
same time as the skip list node allocation (for example
hash_linklist_rep) or have different key types (for example
write_batch_with_index). Taking advantage of inline allocation for
those cases is left to future work.
The perf win is biggest for small values. For single-threaded CPU-bound
(32M fillrandom operations with no WAL log) with 16 byte keys and 0 byte
values, the db_bench perf goes from ~310k ops/sec to ~410k ops/sec. For
large values the improvement is less pronounced, but seems to be between
5% and 10% on the same configuration.
Test Plan: make check
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: dhruba
Differential Revision: https://reviews.facebook.net/D51123
2015-11-19 22:24:29 +00:00
|
|
|
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator iter_;
|
|
|
|
InlineSkipList<const MemTableRep::KeyComparator&>::Iterator prev_;
|
SkipListRep::LookaheadIterator
Summary:
This diff introduces the `lookahead` argument to `SkipListFactory()`. This is an
optimization for the tailing use case which includes many seeks. E.g. consider
the following operations on a skip list iterator:
Seek(x), Next(), Next(), Seek(x+2), Next(), Seek(x+3), Next(), Next(), ...
If `lookahead` is positive, `SkipListRep` will return an iterator which also
keeps track of the previously visited node. Seek() then first does a linear
search starting from that node (up to `lookahead` steps). As in the tailing
example above, this may require fewer than ~log(n) comparisons as with regular
skip list search.
Test Plan:
Added a new benchmark (`fillseekseq`) which simulates the usage pattern. It
first writes N records (with consecutive keys), then measures how much time it
takes to read them by calling `Seek()` and `Next()`.
$ time ./db_bench -num 10000000 -benchmarks fillseekseq -prefix_size 1 \
-key_size 8 -write_buffer_size $[1024*1024*1024] -value_size 50 \
-seekseq_next 2 -skip_list_lookahead=0
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.389 micros/op 2569047 ops/sec;
real 0m21.806s
user 0m12.106s
sys 0m9.672s
$ time ./db_bench [...] -skip_list_lookahead=2
[...]
DB path: [/dev/shm/rocksdbtest/dbbench]
fillseekseq : 0.153 micros/op 6540684 ops/sec;
real 0m19.469s
user 0m10.192s
sys 0m9.252s
Reviewers: ljin, sdong, igor
Reviewed By: igor
Subscribers: dhruba, leveldb, march, lovro
Differential Revision: https://reviews.facebook.net/D23997
2014-09-23 22:52:28 +00:00
|
|
|
};
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
// Builds an iterator over skip_list_. A LookaheadIterator is created when
// lookahead_ is positive, a plain SkipListRep::Iterator otherwise. Storage
// comes from `arena` when one is supplied and from the global operator new
// when it is null; the object is then constructed in place.
MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
  const bool use_lookahead = lookahead_ > 0;

  if (use_lookahead) {
    void* buf;
    if (arena) {
      buf = arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator));
    } else {
      buf = operator new(sizeof(SkipListRep::LookaheadIterator));
    }
    return new (buf) SkipListRep::LookaheadIterator(*this);
  }

  void* buf;
  if (arena) {
    buf = arena->AllocateAligned(sizeof(SkipListRep::Iterator));
  } else {
    buf = operator new(sizeof(SkipListRep::Iterator));
  }
  return new (buf) SkipListRep::Iterator(&skip_list_);
}
|
|
|
|
};
|
2022-10-28 20:16:50 +00:00
|
|
|
} // namespace
|
2013-07-23 21:42:27 +00:00
|
|
|
|
2021-09-08 14:45:59 +00:00
|
|
|
static std::unordered_map<std::string, OptionTypeInfo> skiplist_factory_info = {
|
|
|
|
{"lookahead",
|
|
|
|
{0, OptionType::kSizeT, OptionVerificationType::kNormal,
|
|
|
|
OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}},
|
|
|
|
};
|
|
|
|
|
|
|
|
SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) {
|
|
|
|
RegisterOptions("SkipListFactoryOptions", &lookahead_,
|
|
|
|
&skiplist_factory_info);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string SkipListFactory::GetId() const {
|
|
|
|
std::string id = Name();
|
|
|
|
if (lookahead_ > 0) {
|
2022-05-06 20:03:58 +00:00
|
|
|
id.append(":").append(std::to_string(lookahead_));
|
2021-09-08 14:45:59 +00:00
|
|
|
}
|
|
|
|
return id;
|
|
|
|
}
|
|
|
|
|
2014-01-16 02:17:58 +00:00
|
|
|
// Creates a heap-allocated SkipListRep from the given comparator, allocator
// and prefix transform, configured with this factory's lookahead_. The logger
// argument is not used.
MemTableRep* SkipListFactory::CreateMemTableRep(
    const MemTableRep::KeyComparator& compare, Allocator* allocator,
    const SliceTransform* transform, Logger* /*logger*/) {
  MemTableRep* rep =
      new SkipListRep(compare, allocator, transform, lookahead_);
  return rep;
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|