2020-03-13 04:39:36 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#pragma once
|
2023-09-23 01:12:08 +00:00
|
|
|
#include <deque>
|
|
|
|
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
#include "db/seqno_to_time_mapping.h"
|
2020-03-13 04:39:36 +00:00
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/block_based/block_based_table_reader_impl.h"
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 19:17:34 +00:00
|
|
|
#include "table/block_based/block_prefetcher.h"
|
2020-03-13 04:39:36 +00:00
|
|
|
#include "table/block_based/reader_common.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Iterates over the contents of BlockBasedTable.
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 19:17:34 +00:00
|
|
|
class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
|
2020-03-13 04:39:36 +00:00
|
|
|
// compaction_readahead_size: its value will only be used if for_compaction =
|
|
|
|
// true
|
2020-08-03 22:21:56 +00:00
|
|
|
// @param read_options Must outlive this iterator.
|
2020-03-13 04:39:36 +00:00
|
|
|
public:
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 19:17:34 +00:00
|
|
|
BlockBasedTableIterator(
|
|
|
|
const BlockBasedTable* table, const ReadOptions& read_options,
|
|
|
|
const InternalKeyComparator& icomp,
|
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>>&& index_iter,
|
|
|
|
bool check_filter, bool need_upper_bound_check,
|
|
|
|
const SliceTransform* prefix_extractor, TableReaderCaller caller,
|
2020-08-03 22:21:56 +00:00
|
|
|
size_t compaction_readahead_size = 0, bool allow_unprepared_value = false)
|
2021-11-11 00:18:27 +00:00
|
|
|
: index_iter_(std::move(index_iter)),
|
|
|
|
table_(table),
|
2020-03-13 04:39:36 +00:00
|
|
|
read_options_(read_options),
|
|
|
|
icomp_(icomp),
|
|
|
|
user_comparator_(icomp.user_comparator()),
|
|
|
|
pinned_iters_mgr_(nullptr),
|
|
|
|
prefix_extractor_(prefix_extractor),
|
|
|
|
lookup_context_(caller),
|
2022-04-16 00:28:09 +00:00
|
|
|
block_prefetcher_(
|
|
|
|
compaction_readahead_size,
|
|
|
|
table_->get_rep()->table_options.initial_auto_readahead_size),
|
2020-08-03 22:21:56 +00:00
|
|
|
allow_unprepared_value_(allow_unprepared_value),
|
|
|
|
block_iter_points_to_real_block_(false),
|
|
|
|
check_filter_(check_filter),
|
2022-05-20 23:09:33 +00:00
|
|
|
need_upper_bound_check_(need_upper_bound_check),
|
Much better stats for seeks and prefix filtering (#11460)
Summary:
We want to know more about opportunities for better range filters, and the effectiveness of our own range filters. Currently the stats are very limited, essentially logging just hits and misses against prefix filters for range scans in BLOOM_FILTER_PREFIX_* without tracking the false positive rate. Perhaps confusingly, when prefix filters are used for point queries, the stats are currently going into the non-PREFIX tickers.
This change does several things:
* Introduce new stat tickers for seeks and related filtering, \*LEVEL_SEEK\*
* Most importantly, allows us to see opportunities for range filtering. Specifically, we can count how many times a seek in an SST file accesses at least one data block, and how many times at least one value() is then accessed. If a data block was accessed but no value(), we can generally assume that the key(s) seen was(were) not of interest so could have been filtered with the right kind of filter, avoiding the data block access.
* We can get the same level of detail when a filter (for now, prefix Bloom/ribbon) is used, or not. Specifically, we can infer a false positive rate for prefix filters (not available before) from the seek "false positive" rate: when a data block is accessed but no value() is called. (There can be other explanations for a seek false positive, but in typical iterator usage it would indicate a filter false positive.)
* For efficiency, I wanted to avoid making additional calls to the prefix extractor (or key comparisons, etc.), which would be required if we wanted to more precisely detect filter false positives. I believe that instrumenting value() is the best balance of efficiency vs. accurately measuring what we are often interested in.
* The stats are divided between last level and non-last levels, to help understand potential tiered storage use cases.
* The old BLOOM_FILTER_PREFIX_* stats have a different meaning: no longer referring to iterators but to point queries using prefix filters. BLOOM_FILTER_PREFIX_TRUE_POSITIVE is added for computing the prefix false positive rate on point queries, which can be due to filter false positives as well as different keys with the same prefix.
* Similarly, the non-PREFIX BLOOM_FILTER stats are now for whole key filtering only.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11460
Test Plan:
unit tests updated, including updating many to pop the stat value since last read to improve test
readability and maintainability.
Performance test shows a consistent small improvement with these changes, both with clang and with gcc. CPU profile indicates that RecordTick is using less CPU, and this makes sense at least for a high filter miss rate. Before, we were recording two ticks per filter miss in iterators (CHECKED & USEFUL) and now recording just one (FILTERED).
Create DB with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
```
And run simultaneous before&after with
```
TEST_TMPDIR=/dev/shm ./db_bench -readonly -benchmarks=seekrandom[-X1000] -num=10000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -seek_nexts=1 -duration=20 -seed=43 -threads=8 -cache_size=1000000000 -statistics
```
Before: seekrandom [AVG 275 runs] : 189680 (± 222) ops/sec; 18.4 (± 0.0) MB/sec
After: seekrandom [AVG 275 runs] : 197110 (± 208) ops/sec; 19.1 (± 0.0) MB/sec
Reviewed By: ajkr
Differential Revision: D46029177
Pulled By: pdillinger
fbshipit-source-id: cdace79a2ea548d46c5900b068c5b7c3a02e5822
2023-05-19 22:25:49 +00:00
|
|
|
async_read_in_progress_(false),
|
|
|
|
is_last_level_(table->IsLastLevel()) {}
|
2020-03-13 04:39:36 +00:00
|
|
|
|
2023-09-23 01:12:08 +00:00
|
|
|
~BlockBasedTableIterator() override { ClearBlockHandles(); }
|
2020-03-13 04:39:36 +00:00
|
|
|
|
|
|
|
void Seek(const Slice& target) override;
|
|
|
|
void SeekForPrev(const Slice& target) override;
|
|
|
|
void SeekToFirst() override;
|
|
|
|
void SeekToLast() override;
|
|
|
|
void Next() final override;
|
|
|
|
bool NextAndGetResult(IterateResult* result) override;
|
|
|
|
void Prev() override;
|
|
|
|
bool Valid() const override {
|
|
|
|
return !is_out_of_bound_ &&
|
|
|
|
(is_at_first_key_from_index_ ||
|
|
|
|
(block_iter_points_to_real_block_ && block_iter_.Valid()));
|
|
|
|
}
|
2023-09-23 01:12:08 +00:00
|
|
|
|
|
|
|
// For block cache readahead lookup scenario -
|
|
|
|
// If is_at_first_key_from_index_ is true, InitDataBlock hasn't been
|
|
|
|
// called. It means block_handles is empty and index_ point to current block.
|
|
|
|
// So index_iter_ can be accessed directly.
|
2020-03-13 04:39:36 +00:00
|
|
|
Slice key() const override {
|
|
|
|
assert(Valid());
|
|
|
|
if (is_at_first_key_from_index_) {
|
|
|
|
return index_iter_->value().first_internal_key;
|
|
|
|
} else {
|
|
|
|
return block_iter_.key();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Slice user_key() const override {
|
|
|
|
assert(Valid());
|
|
|
|
if (is_at_first_key_from_index_) {
|
|
|
|
return ExtractUserKey(index_iter_->value().first_internal_key);
|
|
|
|
} else {
|
|
|
|
return block_iter_.user_key();
|
|
|
|
}
|
|
|
|
}
|
2023-09-23 01:12:08 +00:00
|
|
|
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
bool PrepareValue() override {
|
2020-03-13 04:39:36 +00:00
|
|
|
assert(Valid());
|
|
|
|
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
if (!is_at_first_key_from_index_) {
|
|
|
|
return true;
|
2020-03-13 04:39:36 +00:00
|
|
|
}
|
|
|
|
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
return const_cast<BlockBasedTableIterator*>(this)
|
|
|
|
->MaterializeCurrentBlock();
|
|
|
|
}
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
|
|
|
|
uint64_t write_unix_time() const override {
|
|
|
|
assert(Valid());
|
2024-03-21 17:00:15 +00:00
|
|
|
ParsedInternalKey pikey;
|
|
|
|
SequenceNumber seqno;
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
const SeqnoToTimeMapping& seqno_to_time_mapping =
|
|
|
|
table_->GetSeqnoToTimeMapping();
|
2024-03-21 17:00:15 +00:00
|
|
|
Status s = ParseInternalKey(key(), &pikey, /*log_err_key=*/false);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return std::numeric_limits<uint64_t>::max();
|
|
|
|
} else if (kUnknownSeqnoBeforeAll == pikey.sequence) {
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
return kUnknownTimeBeforeAll;
|
|
|
|
} else if (seqno_to_time_mapping.Empty()) {
|
|
|
|
return std::numeric_limits<uint64_t>::max();
|
2024-03-21 17:00:15 +00:00
|
|
|
} else if (kTypeValuePreferredSeqno == pikey.type) {
|
|
|
|
seqno = ParsePackedValueForSeqno(value());
|
|
|
|
} else {
|
|
|
|
seqno = pikey.sequence;
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
}
|
|
|
|
return seqno_to_time_mapping.GetProximalTimeBeforeSeqno(seqno);
|
|
|
|
}
|
|
|
|
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
Slice value() const override {
|
|
|
|
// PrepareValue() must have been called.
|
|
|
|
assert(!is_at_first_key_from_index_);
|
|
|
|
assert(Valid());
|
|
|
|
|
Much better stats for seeks and prefix filtering (#11460)
Summary:
We want to know more about opportunities for better range filters, and the effectiveness of our own range filters. Currently the stats are very limited, essentially logging just hits and misses against prefix filters for range scans in BLOOM_FILTER_PREFIX_* without tracking the false positive rate. Perhaps confusingly, when prefix filters are used for point queries, the stats are currently going into the non-PREFIX tickers.
This change does several things:
* Introduce new stat tickers for seeks and related filtering, \*LEVEL_SEEK\*
* Most importantly, allows us to see opportunities for range filtering. Specifically, we can count how many times a seek in an SST file accesses at least one data block, and how many times at least one value() is then accessed. If a data block was accessed but no value(), we can generally assume that the key(s) seen was(were) not of interest so could have been filtered with the right kind of filter, avoiding the data block access.
* We can get the same level of detail when a filter (for now, prefix Bloom/ribbon) is used, or not. Specifically, we can infer a false positive rate for prefix filters (not available before) from the seek "false positive" rate: when a data block is accessed but no value() is called. (There can be other explanations for a seek false positive, but in typical iterator usage it would indicate a filter false positive.)
* For efficiency, I wanted to avoid making additional calls to the prefix extractor (or key comparisons, etc.), which would be required if we wanted to more precisely detect filter false positives. I believe that instrumenting value() is the best balance of efficiency vs. accurately measuring what we are often interested in.
* The stats are divided between last level and non-last levels, to help understand potential tiered storage use cases.
* The old BLOOM_FILTER_PREFIX_* stats have a different meaning: no longer referring to iterators but to point queries using prefix filters. BLOOM_FILTER_PREFIX_TRUE_POSITIVE is added for computing the prefix false positive rate on point queries, which can be due to filter false positives as well as different keys with the same prefix.
* Similarly, the non-PREFIX BLOOM_FILTER stats are now for whole key filtering only.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11460
Test Plan:
unit tests updated, including updating many to pop the stat value since last read to improve test
readability and maintainability.
Performance test shows a consistent small improvement with these changes, both with clang and with gcc. CPU profile indicates that RecordTick is using less CPU, and this makes sense at least for a high filter miss rate. Before, we were recording two ticks per filter miss in iterators (CHECKED & USEFUL) and now recording just one (FILTERED).
Create DB with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
```
And run simultaneous before&after with
```
TEST_TMPDIR=/dev/shm ./db_bench -readonly -benchmarks=seekrandom[-X1000] -num=10000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -seek_nexts=1 -duration=20 -seed=43 -threads=8 -cache_size=1000000000 -statistics
```
Before: seekrandom [AVG 275 runs] : 189680 (± 222) ops/sec; 18.4 (± 0.0) MB/sec
After: seekrandom [AVG 275 runs] : 197110 (± 208) ops/sec; 19.1 (± 0.0) MB/sec
Reviewed By: ajkr
Differential Revision: D46029177
Pulled By: pdillinger
fbshipit-source-id: cdace79a2ea548d46c5900b068c5b7c3a02e5822
2023-05-19 22:25:49 +00:00
|
|
|
if (seek_stat_state_ & kReportOnUseful) {
|
|
|
|
bool filter_used = (seek_stat_state_ & kFilterUsed) != 0;
|
|
|
|
RecordTick(
|
|
|
|
table_->GetStatistics(),
|
|
|
|
filter_used
|
|
|
|
? (is_last_level_ ? LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH
|
|
|
|
: NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH)
|
|
|
|
: (is_last_level_ ? LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER
|
|
|
|
: NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER));
|
|
|
|
seek_stat_state_ = kDataBlockReadSinceLastSeek;
|
|
|
|
}
|
|
|
|
|
2020-03-13 04:39:36 +00:00
|
|
|
return block_iter_.value();
|
|
|
|
}
|
|
|
|
Status status() const override {
|
2023-09-23 01:12:08 +00:00
|
|
|
// In case of block cache readahead lookup, it won't add the block to
|
|
|
|
// block_handles if it's index is invalid. So index_iter_->status check can
|
|
|
|
// be skipped.
|
|
|
|
// Prefix index set status to NotFound when the prefix does not exist.
|
|
|
|
if (IsIndexAtCurr() && !index_iter_->status().ok() &&
|
|
|
|
!index_iter_->status().IsNotFound()) {
|
2020-03-13 04:39:36 +00:00
|
|
|
return index_iter_->status();
|
|
|
|
} else if (block_iter_points_to_real_block_) {
|
|
|
|
return block_iter_.status();
|
2022-05-20 23:09:33 +00:00
|
|
|
} else if (async_read_in_progress_) {
|
2024-01-30 00:31:09 +00:00
|
|
|
return Status::TryAgain("Async read in progress");
|
2020-03-13 04:39:36 +00:00
|
|
|
} else {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-08-05 17:42:56 +00:00
|
|
|
inline IterBoundCheck UpperBoundCheckResult() override {
|
|
|
|
if (is_out_of_bound_) {
|
|
|
|
return IterBoundCheck::kOutOfBound;
|
|
|
|
} else if (block_upper_bound_check_ ==
|
|
|
|
BlockUpperBound::kUpperBoundBeyondCurBlock) {
|
|
|
|
assert(!is_out_of_bound_);
|
|
|
|
return IterBoundCheck::kInbound;
|
|
|
|
} else {
|
|
|
|
return IterBoundCheck::kUnknown;
|
|
|
|
}
|
2020-03-13 04:39:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
|
|
|
|
pinned_iters_mgr_ = pinned_iters_mgr;
|
|
|
|
}
|
|
|
|
bool IsKeyPinned() const override {
|
|
|
|
// Our key comes either from block_iter_'s current key
|
|
|
|
// or index_iter_'s current *value*.
|
|
|
|
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
|
|
|
|
((is_at_first_key_from_index_ && index_iter_->IsValuePinned()) ||
|
|
|
|
(block_iter_points_to_real_block_ && block_iter_.IsKeyPinned()));
|
|
|
|
}
|
|
|
|
bool IsValuePinned() const override {
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
assert(!is_at_first_key_from_index_);
|
|
|
|
assert(Valid());
|
|
|
|
|
2020-03-13 04:39:36 +00:00
|
|
|
// BlockIter::IsValuePinned() is always true. No need to check
|
|
|
|
return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
|
|
|
|
block_iter_points_to_real_block_;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ResetDataIter() {
|
|
|
|
if (block_iter_points_to_real_block_) {
|
|
|
|
if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) {
|
|
|
|
block_iter_.DelegateCleanupsTo(pinned_iters_mgr_);
|
|
|
|
}
|
|
|
|
block_iter_.Invalidate(Status::OK());
|
|
|
|
block_iter_points_to_real_block_ = false;
|
|
|
|
}
|
2020-08-04 18:28:02 +00:00
|
|
|
block_upper_bound_check_ = BlockUpperBound::kUnknown;
|
2020-03-13 04:39:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void SavePrevIndexValue() {
|
2023-09-23 01:12:08 +00:00
|
|
|
if (block_iter_points_to_real_block_ && IsIndexAtCurr()) {
|
2020-03-13 04:39:36 +00:00
|
|
|
// Reseek. If they end up with the same data block, we shouldn't re-fetch
|
|
|
|
// the same data block.
|
|
|
|
prev_block_offset_ = index_iter_->value().handle.offset();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
void GetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
|
|
|
|
if (block_prefetcher_.prefetch_buffer() != nullptr &&
|
|
|
|
read_options_.adaptive_readahead) {
|
|
|
|
block_prefetcher_.prefetch_buffer()->GetReadaheadState(
|
|
|
|
&(readahead_file_info->data_block_readahead_info));
|
|
|
|
if (index_iter_) {
|
|
|
|
index_iter_->GetReadaheadState(readahead_file_info);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetReadaheadState(ReadaheadFileInfo* readahead_file_info) override {
|
2021-12-01 06:52:14 +00:00
|
|
|
if (read_options_.adaptive_readahead) {
|
|
|
|
block_prefetcher_.SetReadaheadState(
|
|
|
|
&(readahead_file_info->data_block_readahead_info));
|
|
|
|
if (index_iter_) {
|
|
|
|
index_iter_->SetReadaheadState(readahead_file_info);
|
|
|
|
}
|
2021-11-11 00:18:27 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
FilePrefetchBuffer* prefetch_buffer() {
|
|
|
|
return block_prefetcher_.prefetch_buffer();
|
|
|
|
}
|
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
|
|
|
|
|
2020-03-13 04:39:36 +00:00
|
|
|
private:
|
|
|
|
enum class IterDirection {
|
|
|
|
kForward,
|
|
|
|
kBackward,
|
|
|
|
};
|
2020-08-04 18:28:02 +00:00
|
|
|
// This enum indicates whether the upper bound falls into current block
|
|
|
|
// or beyond.
|
|
|
|
// +-------------+
|
|
|
|
// | cur block | <-- (1)
|
|
|
|
// +-------------+
|
|
|
|
// <-- (2)
|
|
|
|
// --- <boundary key> ---
|
|
|
|
// <-- (3)
|
|
|
|
// +-------------+
|
|
|
|
// | next block | <-- (4)
|
|
|
|
// ......
|
|
|
|
//
|
|
|
|
// When the block is smaller than <boundary key>, kUpperBoundInCurBlock
|
|
|
|
// is the value to use. The examples are (1) or (2) in the graph. It means
|
|
|
|
// all keys in the next block or beyond will be out of bound. Keys within
|
|
|
|
// the current block may or may not be out of bound.
|
|
|
|
// When the block is larger or equal to <boundary key>,
|
|
|
|
// kUpperBoundBeyondCurBlock is to be used. The examples are (3) and (4)
|
|
|
|
// in the graph. It means that all keys in the current block is within the
|
|
|
|
// upper bound and keys in the next block may or may not be within the uppder
|
|
|
|
// bound.
|
|
|
|
// If the boundary key hasn't been checked against the upper bound,
|
|
|
|
// kUnknown can be used.
|
Much better stats for seeks and prefix filtering (#11460)
Summary:
We want to know more about opportunities for better range filters, and the effectiveness of our own range filters. Currently the stats are very limited, essentially logging just hits and misses against prefix filters for range scans in BLOOM_FILTER_PREFIX_* without tracking the false positive rate. Perhaps confusingly, when prefix filters are used for point queries, the stats are currently going into the non-PREFIX tickers.
This change does several things:
* Introduce new stat tickers for seeks and related filtering, \*LEVEL_SEEK\*
* Most importantly, allows us to see opportunities for range filtering. Specifically, we can count how many times a seek in an SST file accesses at least one data block, and how many times at least one value() is then accessed. If a data block was accessed but no value(), we can generally assume that the key(s) seen was(were) not of interest so could have been filtered with the right kind of filter, avoiding the data block access.
* We can get the same level of detail when a filter (for now, prefix Bloom/ribbon) is used, or not. Specifically, we can infer a false positive rate for prefix filters (not available before) from the seek "false positive" rate: when a data block is accessed but no value() is called. (There can be other explanations for a seek false positive, but in typical iterator usage it would indicate a filter false positive.)
* For efficiency, I wanted to avoid making additional calls to the prefix extractor (or key comparisons, etc.), which would be required if we wanted to more precisely detect filter false positives. I believe that instrumenting value() is the best balance of efficiency vs. accurately measuring what we are often interested in.
* The stats are divided between last level and non-last levels, to help understand potential tiered storage use cases.
* The old BLOOM_FILTER_PREFIX_* stats have a different meaning: no longer referring to iterators but to point queries using prefix filters. BLOOM_FILTER_PREFIX_TRUE_POSITIVE is added for computing the prefix false positive rate on point queries, which can be due to filter false positives as well as different keys with the same prefix.
* Similarly, the non-PREFIX BLOOM_FILTER stats are now for whole key filtering only.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11460
Test Plan:
unit tests updated, including updating many to pop the stat value since last read to improve test
readability and maintainability.
Performance test shows a consistent small improvement with these changes, both with clang and with gcc. CPU profile indicates that RecordTick is using less CPU, and this makes sense at least for a high filter miss rate. Before, we were recording two ticks per filter miss in iterators (CHECKED & USEFUL) and now recording just one (FILTERED).
Create DB with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
```
And run simultaneous before&after with
```
TEST_TMPDIR=/dev/shm ./db_bench -readonly -benchmarks=seekrandom[-X1000] -num=10000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -seek_nexts=1 -duration=20 -seed=43 -threads=8 -cache_size=1000000000 -statistics
```
Before: seekrandom [AVG 275 runs] : 189680 (± 222) ops/sec; 18.4 (± 0.0) MB/sec
After: seekrandom [AVG 275 runs] : 197110 (± 208) ops/sec; 19.1 (± 0.0) MB/sec
Reviewed By: ajkr
Differential Revision: D46029177
Pulled By: pdillinger
fbshipit-source-id: cdace79a2ea548d46c5900b068c5b7c3a02e5822
2023-05-19 22:25:49 +00:00
|
|
|
enum class BlockUpperBound : uint8_t {
|
2020-08-04 18:28:02 +00:00
|
|
|
kUpperBoundInCurBlock,
|
|
|
|
kUpperBoundBeyondCurBlock,
|
|
|
|
kUnknown,
|
|
|
|
};
|
2020-03-13 04:39:36 +00:00
|
|
|
|
Much better stats for seeks and prefix filtering (#11460)
Summary:
We want to know more about opportunities for better range filters, and the effectiveness of our own range filters. Currently the stats are very limited, essentially logging just hits and misses against prefix filters for range scans in BLOOM_FILTER_PREFIX_* without tracking the false positive rate. Perhaps confusingly, when prefix filters are used for point queries, the stats are currently going into the non-PREFIX tickers.
This change does several things:
* Introduce new stat tickers for seeks and related filtering, \*LEVEL_SEEK\*
* Most importantly, allows us to see opportunities for range filtering. Specifically, we can count how many times a seek in an SST file accesses at least one data block, and how many times at least one value() is then accessed. If a data block was accessed but no value(), we can generally assume that the key(s) seen was(were) not of interest so could have been filtered with the right kind of filter, avoiding the data block access.
* We can get the same level of detail when a filter (for now, prefix Bloom/ribbon) is used, or not. Specifically, we can infer a false positive rate for prefix filters (not available before) from the seek "false positive" rate: when a data block is accessed but no value() is called. (There can be other explanations for a seek false positive, but in typical iterator usage it would indicate a filter false positive.)
* For efficiency, I wanted to avoid making additional calls to the prefix extractor (or key comparisons, etc.), which would be required if we wanted to more precisely detect filter false positives. I believe that instrumenting value() is the best balance of efficiency vs. accurately measuring what we are often interested in.
* The stats are divided between last level and non-last levels, to help understand potential tiered storage use cases.
* The old BLOOM_FILTER_PREFIX_* stats have a different meaning: no longer referring to iterators but to point queries using prefix filters. BLOOM_FILTER_PREFIX_TRUE_POSITIVE is added for computing the prefix false positive rate on point queries, which can be due to filter false positives as well as different keys with the same prefix.
* Similarly, the non-PREFIX BLOOM_FILTER stats are now for whole key filtering only.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11460
Test Plan:
unit tests updated, including updating many to pop the stat value since last read to improve test
readability and maintainability.
Performance test shows a consistent small improvement with these changes, both with clang and with gcc. CPU profile indicates that RecordTick is using less CPU, and this makes sense at least for a high filter miss rate. Before, we were recording two ticks per filter miss in iterators (CHECKED & USEFUL) and now recording just one (FILTERED).
Create DB with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
```
And run simultaneous before&after with
```
TEST_TMPDIR=/dev/shm ./db_bench -readonly -benchmarks=seekrandom[-X1000] -num=10000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -seek_nexts=1 -duration=20 -seed=43 -threads=8 -cache_size=1000000000 -statistics
```
Before: seekrandom [AVG 275 runs] : 189680 (± 222) ops/sec; 18.4 (± 0.0) MB/sec
After: seekrandom [AVG 275 runs] : 197110 (± 208) ops/sec; 19.1 (± 0.0) MB/sec
Reviewed By: ajkr
Differential Revision: D46029177
Pulled By: pdillinger
fbshipit-source-id: cdace79a2ea548d46c5900b068c5b7c3a02e5822
2023-05-19 22:25:49 +00:00
|
|
|
// State bits for collecting stats on seeks and whether they returned useful
|
|
|
|
// results.
|
|
|
|
enum SeekStatState : uint8_t {
|
|
|
|
kNone = 0,
|
|
|
|
// Most recent seek checked prefix filter (or similar future feature)
|
|
|
|
kFilterUsed = 1 << 0,
|
|
|
|
// Already recorded that a data block was accessed since the last seek.
|
|
|
|
kDataBlockReadSinceLastSeek = 1 << 1,
|
|
|
|
// Have not yet recorded that a value() was accessed.
|
|
|
|
kReportOnUseful = 1 << 2,
|
|
|
|
};
|
|
|
|
|
2023-09-23 01:12:08 +00:00
|
|
|
// BlockHandleInfo is used to store the info needed when block cache lookup
|
|
|
|
// ahead is enabled to tune readahead_size.
|
|
|
|
struct BlockHandleInfo {
|
2023-10-17 19:21:08 +00:00
|
|
|
void SetFirstInternalKey(const Slice& key) {
|
|
|
|
if (key.empty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
size_t size = key.size();
|
|
|
|
buf_ = std::unique_ptr<char[]>(new char[size]);
|
|
|
|
memcpy(buf_.get(), key.data(), size);
|
|
|
|
first_internal_key_ = Slice(buf_.get(), size);
|
|
|
|
}
|
2023-09-23 01:12:08 +00:00
|
|
|
|
2023-10-17 19:21:08 +00:00
|
|
|
BlockHandle handle_;
|
2023-09-23 01:12:08 +00:00
|
|
|
bool is_cache_hit_ = false;
|
|
|
|
CachableEntry<Block> cachable_entry_;
|
2023-10-17 19:21:08 +00:00
|
|
|
Slice first_internal_key_;
|
|
|
|
std::unique_ptr<char[]> buf_;
|
2023-09-23 01:12:08 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
bool IsIndexAtCurr() const { return is_index_at_curr_block_; }
|
|
|
|
|
2020-03-13 04:39:36 +00:00
|
|
|
const BlockBasedTable* table_;
|
2020-08-03 22:21:56 +00:00
|
|
|
const ReadOptions& read_options_;
|
2020-03-13 04:39:36 +00:00
|
|
|
const InternalKeyComparator& icomp_;
|
|
|
|
UserComparatorWrapper user_comparator_;
|
|
|
|
PinnedIteratorsManager* pinned_iters_mgr_;
|
De-template block based table iterator (#6531)
Summary:
Right now block based table iterator is used as both of iterating data for block based table, and for the index iterator for partitioend index. This was initially convenient for introducing a new iterator and block type for new index format, while reducing code change. However, these two usage doesn't go with each other very well. For example, Prev() is never called for partitioned index iterator, and some other complexity is maintained in block based iterators, which is not needed for index iterator but maintainers will always need to reason about it. Furthermore, the template usage is not following Google C++ Style which we are following, and makes a large chunk of code tangled together. This commit separate the two iterators. Right now, here is what it is done:
1. Copy the block based iterator code into partitioned index iterator, and de-template them.
2. Remove some code not needed for partitioned index. The upper bound check and tricks are removed. We never tested performance for those tricks when partitioned index is enabled in the first place. It's unlikelyl to generate performance regression, as creating new partitioned index block is much rarer than data blocks.
3. Separate out the prefetch logic to a helper class and both classes call them.
This commit will enable future follow-ups. One direction is that we might separate index iterator interface for data blocks and index blocks, as they are quite different.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6531
Test Plan: build using make and cmake. And build release
Differential Revision: D20473108
fbshipit-source-id: e48011783b339a4257c204cc07507b171b834b0f
2020-03-16 19:17:34 +00:00
|
|
|
DataBlockIter block_iter_;
|
2020-08-03 22:21:56 +00:00
|
|
|
const SliceTransform* prefix_extractor_;
|
|
|
|
uint64_t prev_block_offset_ = std::numeric_limits<uint64_t>::max();
|
|
|
|
BlockCacheLookupContext lookup_context_;
|
|
|
|
|
|
|
|
BlockPrefetcher block_prefetcher_;
|
2020-03-13 04:39:36 +00:00
|
|
|
|
2020-08-03 22:21:56 +00:00
|
|
|
const bool allow_unprepared_value_;
|
2020-03-13 04:39:36 +00:00
|
|
|
// True if block_iter_ is initialized and points to the same block
|
|
|
|
// as index iterator.
|
|
|
|
bool block_iter_points_to_real_block_;
|
|
|
|
// See InternalIteratorBase::IsOutOfBound().
|
|
|
|
bool is_out_of_bound_ = false;
|
2020-08-04 18:28:02 +00:00
|
|
|
// How current data block's boundary key with the next block is compared with
|
|
|
|
// iterate upper bound.
|
|
|
|
BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown;
|
2020-03-13 04:39:36 +00:00
|
|
|
// True if we're standing at the first key of a block, and we haven't loaded
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
// that block yet. A call to PrepareValue() will trigger loading the block.
|
2020-03-13 04:39:36 +00:00
|
|
|
bool is_at_first_key_from_index_ = false;
|
|
|
|
bool check_filter_;
|
|
|
|
// TODO(Zhongyi): pick a better name
|
|
|
|
bool need_upper_bound_check_;
|
|
|
|
|
2022-05-20 23:09:33 +00:00
|
|
|
bool async_read_in_progress_;
|
|
|
|
|
Much better stats for seeks and prefix filtering (#11460)
Summary:
We want to know more about opportunities for better range filters, and the effectiveness of our own range filters. Currently the stats are very limited, essentially logging just hits and misses against prefix filters for range scans in BLOOM_FILTER_PREFIX_* without tracking the false positive rate. Perhaps confusingly, when prefix filters are used for point queries, the stats are currently going into the non-PREFIX tickers.
This change does several things:
* Introduce new stat tickers for seeks and related filtering, \*LEVEL_SEEK\*
* Most importantly, allows us to see opportunities for range filtering. Specifically, we can count how many times a seek in an SST file accesses at least one data block, and how many times at least one value() is then accessed. If a data block was accessed but no value(), we can generally assume that the key(s) seen was(were) not of interest so could have been filtered with the right kind of filter, avoiding the data block access.
* We can get the same level of detail when a filter (for now, prefix Bloom/ribbon) is used, or not. Specifically, we can infer a false positive rate for prefix filters (not available before) from the seek "false positive" rate: when a data block is accessed but no value() is called. (There can be other explanations for a seek false positive, but in typical iterator usage it would indicate a filter false positive.)
* For efficiency, I wanted to avoid making additional calls to the prefix extractor (or key comparisons, etc.), which would be required if we wanted to more precisely detect filter false positives. I believe that instrumenting value() is the best balance of efficiency vs. accurately measuring what we are often interested in.
* The stats are divided between last level and non-last levels, to help understand potential tiered storage use cases.
* The old BLOOM_FILTER_PREFIX_* stats have a different meaning: no longer referring to iterators but to point queries using prefix filters. BLOOM_FILTER_PREFIX_TRUE_POSITIVE is added for computing the prefix false positive rate on point queries, which can be due to filter false positives as well as different keys with the same prefix.
* Similarly, the non-PREFIX BLOOM_FILTER stats are now for whole key filtering only.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11460
Test Plan:
unit tests updated, including updating many to pop the stat value since last read to improve test
readability and maintainability.
Performance test shows a consistent small improvement with these changes, both with clang and with gcc. CPU profile indicates that RecordTick is using less CPU, and this makes sense at least for a high filter miss rate. Before, we were recording two ticks per filter miss in iterators (CHECKED & USEFUL) and now recording just one (FILTERED).
Create DB with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
```
And run simultaneous before&after with
```
TEST_TMPDIR=/dev/shm ./db_bench -readonly -benchmarks=seekrandom[-X1000] -num=10000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -seek_nexts=1 -duration=20 -seed=43 -threads=8 -cache_size=1000000000 -statistics
```
Before: seekrandom [AVG 275 runs] : 189680 (± 222) ops/sec; 18.4 (± 0.0) MB/sec
After: seekrandom [AVG 275 runs] : 197110 (± 208) ops/sec; 19.1 (± 0.0) MB/sec
Reviewed By: ajkr
Differential Revision: D46029177
Pulled By: pdillinger
fbshipit-source-id: cdace79a2ea548d46c5900b068c5b7c3a02e5822
2023-05-19 22:25:49 +00:00
|
|
|
mutable SeekStatState seek_stat_state_ = SeekStatState::kNone;
|
|
|
|
bool is_last_level_;
|
|
|
|
|
2023-09-23 01:12:08 +00:00
|
|
|
// If set to true, it'll lookup in the cache ahead to estimate the readahead
|
|
|
|
// size based on cache hit and miss.
|
|
|
|
bool readahead_cache_lookup_ = false;
|
|
|
|
|
|
|
|
// It stores all the block handles that are lookuped in cache ahead when
|
|
|
|
// BlockCacheLookupForReadAheadSize is called. Since index_iter_ may point to
|
|
|
|
// different blocks when readahead_size is calculated in
|
|
|
|
// BlockCacheLookupForReadAheadSize, to avoid index_iter_ reseek,
|
|
|
|
// block_handles_ is used.
|
2024-05-04 00:18:13 +00:00
|
|
|
// `block_handles_` is lazily constructed to save CPU when it is unused
|
|
|
|
std::unique_ptr<std::deque<BlockHandleInfo>> block_handles_;
|
2023-09-23 01:12:08 +00:00
|
|
|
|
|
|
|
// During cache lookup to find readahead size, index_iter_ is iterated and it
|
|
|
|
// can point to a different block. is_index_at_curr_block_ keeps track of
|
|
|
|
// that.
|
|
|
|
bool is_index_at_curr_block_ = true;
|
2023-10-03 00:47:24 +00:00
|
|
|
bool is_index_out_of_bound_ = false;
|
|
|
|
|
|
|
|
// Used in case of auto_readahead_size to disable the block_cache lookup if
|
|
|
|
// direction is reversed from forward to backward. In case of backward
|
|
|
|
// direction, SeekForPrev or Prev might call Seek from db_iter. So direction
|
|
|
|
// is used to disable the lookup.
|
|
|
|
IterDirection direction_ = IterDirection::kForward;
|
2023-09-23 01:12:08 +00:00
|
|
|
|
2024-10-17 22:52:55 +00:00
|
|
|
// The prefix of the key called with SeekImpl().
|
|
|
|
// This is for readahead trimming so no data blocks containing keys of a
|
|
|
|
// different prefix are prefetched
|
|
|
|
std::string seek_key_prefix_for_readahead_trimming_ = "";
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
void SeekSecondPass(const Slice* target);
|
|
|
|
|
2020-03-13 04:39:36 +00:00
|
|
|
// If `target` is null, seek to first.
|
2022-05-20 23:09:33 +00:00
|
|
|
void SeekImpl(const Slice* target, bool async_prefetch);
|
2020-03-13 04:39:36 +00:00
|
|
|
|
|
|
|
void InitDataBlock();
|
2022-05-20 23:09:33 +00:00
|
|
|
void AsyncInitDataBlock(bool is_first_pass);
|
2020-03-13 04:39:36 +00:00
|
|
|
bool MaterializeCurrentBlock();
|
|
|
|
void FindKeyForward();
|
|
|
|
void FindBlockForward();
|
|
|
|
void FindKeyBackward();
|
|
|
|
void CheckOutOfBound();
|
|
|
|
|
|
|
|
// Check if data block is fully within iterate_upper_bound.
|
|
|
|
//
|
|
|
|
// Note MyRocks may update iterate bounds between seek. To workaround it,
|
|
|
|
// we need to check and update data_block_within_upper_bound_ accordingly.
|
|
|
|
void CheckDataBlockWithinUpperBound();
|
|
|
|
|
Much better stats for seeks and prefix filtering (#11460)
Summary:
We want to know more about opportunities for better range filters, and the effectiveness of our own range filters. Currently the stats are very limited, essentially logging just hits and misses against prefix filters for range scans in BLOOM_FILTER_PREFIX_* without tracking the false positive rate. Perhaps confusingly, when prefix filters are used for point queries, the stats are currently going into the non-PREFIX tickers.
This change does several things:
* Introduce new stat tickers for seeks and related filtering, \*LEVEL_SEEK\*
* Most importantly, allows us to see opportunities for range filtering. Specifically, we can count how many times a seek in an SST file accesses at least one data block, and how many times at least one value() is then accessed. If a data block was accessed but no value(), we can generally assume that the key(s) seen was(were) not of interest so could have been filtered with the right kind of filter, avoiding the data block access.
* We can get the same level of detail when a filter (for now, prefix Bloom/ribbon) is used, or not. Specifically, we can infer a false positive rate for prefix filters (not available before) from the seek "false positive" rate: when a data block is accessed but no value() is called. (There can be other explanations for a seek false positive, but in typical iterator usage it would indicate a filter false positive.)
* For efficiency, I wanted to avoid making additional calls to the prefix extractor (or key comparisons, etc.), which would be required if we wanted to more precisely detect filter false positives. I believe that instrumenting value() is the best balance of efficiency vs. accurately measuring what we are often interested in.
* The stats are divided between last level and non-last levels, to help understand potential tiered storage use cases.
* The old BLOOM_FILTER_PREFIX_* stats have a different meaning: no longer referring to iterators but to point queries using prefix filters. BLOOM_FILTER_PREFIX_TRUE_POSITIVE is added for computing the prefix false positive rate on point queries, which can be due to filter false positives as well as different keys with the same prefix.
* Similarly, the non-PREFIX BLOOM_FILTER stats are now for whole key filtering only.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11460
Test Plan:
unit tests updated, including updating many to pop the stat value since last read to improve test
readability and maintainability.
Performance test shows a consistent small improvement with these changes, both with clang and with gcc. CPU profile indicates that RecordTick is using less CPU, and this makes sense at least for a high filter miss rate. Before, we were recording two ticks per filter miss in iterators (CHECKED & USEFUL) and now recording just one (FILTERED).
Create DB with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
```
And run simultaneous before&after with
```
TEST_TMPDIR=/dev/shm ./db_bench -readonly -benchmarks=seekrandom[-X1000] -num=10000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -seek_nexts=1 -duration=20 -seed=43 -threads=8 -cache_size=1000000000 -statistics
```
Before: seekrandom [AVG 275 runs] : 189680 (± 222) ops/sec; 18.4 (± 0.0) MB/sec
After: seekrandom [AVG 275 runs] : 197110 (± 208) ops/sec; 19.1 (± 0.0) MB/sec
Reviewed By: ajkr
Differential Revision: D46029177
Pulled By: pdillinger
fbshipit-source-id: cdace79a2ea548d46c5900b068c5b7c3a02e5822
2023-05-19 22:25:49 +00:00
|
|
|
bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction,
|
|
|
|
bool* filter_checked) {
|
2020-03-13 04:39:36 +00:00
|
|
|
if (need_upper_bound_check_ && direction == IterDirection::kBackward) {
|
2020-12-01 22:05:19 +00:00
|
|
|
// Upper bound check isn't sufficient for backward direction to
|
2020-03-13 04:39:36 +00:00
|
|
|
// guarantee the same result as total order, so disable prefix
|
|
|
|
// check.
|
|
|
|
return true;
|
|
|
|
}
|
Much better stats for seeks and prefix filtering (#11460)
Summary:
We want to know more about opportunities for better range filters, and the effectiveness of our own range filters. Currently the stats are very limited, essentially logging just hits and misses against prefix filters for range scans in BLOOM_FILTER_PREFIX_* without tracking the false positive rate. Perhaps confusingly, when prefix filters are used for point queries, the stats are currently going into the non-PREFIX tickers.
This change does several things:
* Introduce new stat tickers for seeks and related filtering, \*LEVEL_SEEK\*
* Most importantly, allows us to see opportunities for range filtering. Specifically, we can count how many times a seek in an SST file accesses at least one data block, and how many times at least one value() is then accessed. If a data block was accessed but no value(), we can generally assume that the key(s) seen was(were) not of interest so could have been filtered with the right kind of filter, avoiding the data block access.
* We can get the same level of detail when a filter (for now, prefix Bloom/ribbon) is used, or not. Specifically, we can infer a false positive rate for prefix filters (not available before) from the seek "false positive" rate: when a data block is accessed but no value() is called. (There can be other explanations for a seek false positive, but in typical iterator usage it would indicate a filter false positive.)
* For efficiency, I wanted to avoid making additional calls to the prefix extractor (or key comparisons, etc.), which would be required if we wanted to more precisely detect filter false positives. I believe that instrumenting value() is the best balance of efficiency vs. accurately measuring what we are often interested in.
* The stats are divided between last level and non-last levels, to help understand potential tiered storage use cases.
* The old BLOOM_FILTER_PREFIX_* stats have a different meaning: no longer referring to iterators but to point queries using prefix filters. BLOOM_FILTER_PREFIX_TRUE_POSITIVE is added for computing the prefix false positive rate on point queries, which can be due to filter false positives as well as different keys with the same prefix.
* Similarly, the non-PREFIX BLOOM_FILTER stats are now for whole key filtering only.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11460
Test Plan:
unit tests updated, including updating many to pop the stat value since last read to improve test
readability and maintainability.
Performance test shows a consistent small improvement with these changes, both with clang and with gcc. CPU profile indicates that RecordTick is using less CPU, and this makes sense at least for a high filter miss rate. Before, we were recording two ticks per filter miss in iterators (CHECKED & USEFUL) and now recording just one (FILTERED).
Create DB with
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=10000000 -disable_wal=1 -write_buffer_size=30000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8
```
And run simultaneous before&after with
```
TEST_TMPDIR=/dev/shm ./db_bench -readonly -benchmarks=seekrandom[-X1000] -num=10000000 -bloom_bits=8 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -prefix_size=8 -seek_nexts=1 -duration=20 -seed=43 -threads=8 -cache_size=1000000000 -statistics
```
Before: seekrandom [AVG 275 runs] : 189680 (± 222) ops/sec; 18.4 (± 0.0) MB/sec
After: seekrandom [AVG 275 runs] : 197110 (± 208) ops/sec; 19.1 (± 0.0) MB/sec
Reviewed By: ajkr
Differential Revision: D46029177
Pulled By: pdillinger
fbshipit-source-id: cdace79a2ea548d46c5900b068c5b7c3a02e5822
2023-05-19 22:25:49 +00:00
|
|
|
if (check_filter_ &&
|
|
|
|
!table_->PrefixRangeMayMatch(ikey, read_options_, prefix_extractor_,
|
|
|
|
need_upper_bound_check_, &lookup_context_,
|
|
|
|
filter_checked)) {
|
2020-03-13 04:39:36 +00:00
|
|
|
// TODO remember the iterator is invalidated because of prefix
|
|
|
|
// match. This can avoid the upper level file iterator to falsely
|
|
|
|
// believe the position is the end of the SST file and move to
|
|
|
|
// the first key of the next file.
|
|
|
|
ResetDataIter();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2023-08-18 22:52:04 +00:00
|
|
|
|
2023-09-23 01:12:08 +00:00
|
|
|
// *** BEGIN APIs relevant to auto tuning of readahead_size ***
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
// This API is called to lookup the data blocks ahead in the cache to tune
|
|
|
|
// the start and end offsets passed.
|
|
|
|
void BlockCacheLookupForReadAheadSize(bool read_curr_block,
|
|
|
|
uint64_t& start_offset,
|
|
|
|
uint64_t& end_offset);
|
2023-09-23 01:12:08 +00:00
|
|
|
|
|
|
|
void ResetBlockCacheLookupVar() {
|
2023-10-03 00:47:24 +00:00
|
|
|
is_index_out_of_bound_ = false;
|
2023-09-23 01:12:08 +00:00
|
|
|
readahead_cache_lookup_ = false;
|
|
|
|
ClearBlockHandles();
|
|
|
|
}
|
|
|
|
|
2024-10-17 22:52:55 +00:00
|
|
|
bool IsNextBlockOutOfReadaheadBound() {
|
|
|
|
const Slice& index_iter_user_key = index_iter_->user_key();
|
2023-09-23 01:12:08 +00:00
|
|
|
// If curr block's index key >= iterate_upper_bound, it means all the keys
|
|
|
|
// in next block or above are out of bound.
|
2024-10-17 22:52:55 +00:00
|
|
|
bool out_of_upper_bound =
|
|
|
|
read_options_.iterate_upper_bound != nullptr &&
|
|
|
|
(user_comparator_.CompareWithoutTimestamp(
|
|
|
|
index_iter_user_key,
|
|
|
|
/*a_has_ts=*/true, *read_options_.iterate_upper_bound,
|
|
|
|
/*b_has_ts=*/false) >= 0
|
|
|
|
? true
|
|
|
|
: false);
|
|
|
|
if (out_of_upper_bound) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If curr block's index key has a different prefix from the seek key's, it
|
|
|
|
// means all the keys in next block or above has a different prefix from the
|
|
|
|
// seek key's.
|
|
|
|
bool out_of_prefix_bound =
|
|
|
|
(read_options_.prefix_same_as_start &&
|
|
|
|
!seek_key_prefix_for_readahead_trimming_.empty() &&
|
|
|
|
(prefix_extractor_->InDomain(index_iter_user_key)
|
|
|
|
? (prefix_extractor_->Transform(index_iter_user_key)
|
|
|
|
.compare(seek_key_prefix_for_readahead_trimming_) != 0)
|
|
|
|
: user_comparator_.CompareWithoutTimestamp(
|
|
|
|
index_iter_user_key,
|
|
|
|
/*a_has_ts=*/true, seek_key_prefix_for_readahead_trimming_,
|
|
|
|
/*b_has_ts=*/false) > 0));
|
|
|
|
|
|
|
|
if (out_of_prefix_bound) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
2023-09-23 01:12:08 +00:00
|
|
|
}
|
|
|
|
|
2024-05-04 00:18:13 +00:00
|
|
|
void ClearBlockHandles() {
|
|
|
|
if (block_handles_ != nullptr) {
|
|
|
|
block_handles_->clear();
|
|
|
|
}
|
|
|
|
}
|
2023-09-23 01:12:08 +00:00
|
|
|
|
|
|
|
// Reset prev_block_offset_. If index_iter_ has moved ahead, it won't get
|
|
|
|
// accurate prev_block_offset_.
|
|
|
|
void ResetPreviousBlockOffset() {
|
|
|
|
prev_block_offset_ = std::numeric_limits<uint64_t>::max();
|
|
|
|
}
|
|
|
|
|
2024-05-04 00:18:13 +00:00
|
|
|
bool DoesContainBlockHandles() {
|
|
|
|
return block_handles_ != nullptr && !block_handles_->empty();
|
|
|
|
}
|
2023-09-23 01:12:08 +00:00
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
void InitializeStartAndEndOffsets(bool read_curr_block,
|
|
|
|
bool& found_first_miss_block,
|
|
|
|
uint64_t& start_updated_offset,
|
|
|
|
uint64_t& end_updated_offset,
|
|
|
|
size_t& prev_handles_size);
|
2023-09-23 01:12:08 +00:00
|
|
|
// *** END APIs relevant to auto tuning of readahead_size ***
|
2020-03-13 04:39:36 +00:00
|
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|