2020-04-24 22:30:12 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#include "table/block_fetcher.h"
|
2020-07-09 21:33:42 +00:00
|
|
|
|
2020-04-24 22:30:12 +00:00
|
|
|
#include "db/table_properties_collector.h"
|
2020-07-09 21:33:42 +00:00
|
|
|
#include "file/file_util.h"
|
2020-04-24 22:30:12 +00:00
|
|
|
#include "options/options_helper.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "port/stack_trace.h"
|
2021-09-29 11:01:57 +00:00
|
|
|
#include "rocksdb/db.h"
|
2021-01-29 06:08:46 +00:00
|
|
|
#include "rocksdb/file_system.h"
|
2020-04-24 22:30:12 +00:00
|
|
|
#include "table/block_based/binary_search_index_reader.h"
|
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/format.h"
|
|
|
|
#include "test_util/testharness.h"
|
2021-12-17 12:19:34 +00:00
|
|
|
#include "utilities/memory_allocators.h"
|
2020-04-24 22:30:12 +00:00
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
namespace {
|
|
|
|
struct MemcpyStats {
|
2020-06-30 22:38:59 +00:00
|
|
|
int num_stack_buf_memcpy;
|
|
|
|
int num_heap_buf_memcpy;
|
|
|
|
int num_compressed_buf_memcpy;
|
2020-04-24 22:30:12 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct BufAllocationStats {
|
2020-06-30 22:38:59 +00:00
|
|
|
int num_heap_buf_allocations;
|
|
|
|
int num_compressed_buf_allocations;
|
2020-04-24 22:30:12 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct TestStats {
|
|
|
|
MemcpyStats memcpy_stats;
|
|
|
|
BufAllocationStats buf_allocation_stats;
|
|
|
|
};
|
|
|
|
|
|
|
|
class BlockFetcherTest : public testing::Test {
|
2020-06-30 22:38:59 +00:00
|
|
|
public:
|
|
|
|
enum class Mode {
|
|
|
|
kBufferedRead = 0,
|
|
|
|
kBufferedMmap,
|
|
|
|
kDirectRead,
|
|
|
|
kNumModes,
|
|
|
|
};
|
|
|
|
// use NumModes as array size to avoid "size of array '...' has non-integral
|
|
|
|
// type" errors.
|
|
|
|
const static int NumModes = static_cast<int>(Mode::kNumModes);
|
|
|
|
|
2020-04-24 22:30:12 +00:00
|
|
|
protected:
|
|
|
|
void SetUp() override {
|
2020-07-09 21:33:42 +00:00
|
|
|
SetupSyncPointsToMockDirectIO();
|
2020-04-24 22:30:12 +00:00
|
|
|
test_dir_ = test::PerThreadDBPath("block_fetcher_test");
|
|
|
|
env_ = Env::Default();
|
|
|
|
fs_ = FileSystem::Default();
|
|
|
|
ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
|
|
|
|
}
|
|
|
|
|
2020-07-09 21:33:42 +00:00
|
|
|
void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
|
2020-04-24 22:30:12 +00:00
|
|
|
|
2020-06-30 22:38:59 +00:00
|
|
|
void AssertSameBlock(const std::string& block1, const std::string& block2) {
|
|
|
|
ASSERT_EQ(block1, block2);
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Creates a table with kv pairs (i, i) where i ranges from 0 to 9, inclusive.
|
|
|
|
void CreateTable(const std::string& table_name,
|
|
|
|
const CompressionType& compression_type) {
|
|
|
|
std::unique_ptr<WritableFileWriter> writer;
|
|
|
|
NewFileWriter(table_name, &writer);
|
|
|
|
|
|
|
|
// Create table builder.
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options_);
|
2020-06-30 22:38:59 +00:00
|
|
|
InternalKeyComparator comparator(options_.comparator);
|
|
|
|
ColumnFamilyOptions cf_options(options_);
|
2020-04-24 22:30:12 +00:00
|
|
|
MutableCFOptions moptions(cf_options);
|
2021-05-18 01:27:42 +00:00
|
|
|
IntTblPropCollectorFactories factories;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2020-04-24 22:30:12 +00:00
|
|
|
std::unique_ptr<TableBuilder> table_builder(table_factory_.NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options,
|
|
|
|
comparator, &factories, compression_type,
|
|
|
|
CompressionOptions(), 0 /* column_family_id */,
|
|
|
|
kDefaultColumnFamilyName, -1 /* level */),
|
2021-04-29 13:59:53 +00:00
|
|
|
writer.get()));
|
2020-04-24 22:30:12 +00:00
|
|
|
|
|
|
|
// Build table.
|
|
|
|
for (int i = 0; i < 9; i++) {
|
|
|
|
std::string key = ToInternalKey(std::to_string(i));
|
Fix testcase failures on windows (#7992)
Summary:
Fixed 5 test case failures found on Windows 10/Windows Server 2016
1. In `flush_job_test`, the DestroyDir function fails in deconstructor because some file handles are still being held by VersionSet. This happens on Windows Server 2016, so need to manually reset versions_ pointer to release all file handles.
2. In `StatsHistoryTest.InMemoryStatsHistoryPurging` test, the capping memory cost of stats_history_size on Windows becomes 14000 bytes with latest changes, not just 13000 bytes.
3. In `SSTDumpToolTest.RawOutput` test, the output file handle is not closed at the end.
4. In `FullBloomTest.OptimizeForMemory` test, ROCKSDB_MALLOC_USABLE_SIZE is undefined on windows so `total_mem` is always equal to `total_size`. The internal memory fragmentation assertion does not apply in this case.
5. In `BlockFetcherTest.FetchAndUncompressCompressedDataBlock` test, XPRESS cannot reach 87.5% compression ratio with original CreateTable method, so I append extra zeros to the string value to enhance compression ratio. Beside, since XPRESS allocates memory internally, thus does not support for custom allocator verification, we will skip the allocator verification for XPRESS
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7992
Reviewed By: jay-zhuang
Differential Revision: D26615283
Pulled By: ajkr
fbshipit-source-id: 3632612f84b99e2b9c77c403b112b6bedf3b125d
2021-02-23 22:31:50 +00:00
|
|
|
// Append "00000000" to string value to enhance compression ratio
|
|
|
|
std::string value = "00000000" + std::to_string(i);
|
2020-04-24 22:30:12 +00:00
|
|
|
table_builder->Add(key, value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(table_builder->Finish());
|
|
|
|
}
|
|
|
|
|
2020-06-30 22:38:59 +00:00
|
|
|
void FetchIndexBlock(const std::string& table_name,
|
2020-04-24 22:30:12 +00:00
|
|
|
CountedMemoryAllocator* heap_buf_allocator,
|
|
|
|
CountedMemoryAllocator* compressed_buf_allocator,
|
2020-06-30 22:38:59 +00:00
|
|
|
MemcpyStats* memcpy_stats, BlockContents* index_block,
|
|
|
|
std::string* result) {
|
|
|
|
FileOptions fopt(options_);
|
2020-04-24 22:30:12 +00:00
|
|
|
std::unique_ptr<RandomAccessFileReader> file;
|
|
|
|
NewFileReader(table_name, fopt, &file);
|
|
|
|
|
|
|
|
// Get handle of the index block.
|
|
|
|
Footer footer;
|
|
|
|
ReadFooter(file.get(), &footer);
|
|
|
|
const BlockHandle& index_handle = footer.index_handle();
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
// FIXME: index handle will need to come from metaindex for
|
|
|
|
// format_version >= 6 when that becomes the default
|
|
|
|
ASSERT_FALSE(index_handle.IsNull());
|
2020-04-24 22:30:12 +00:00
|
|
|
|
|
|
|
CompressionType compression_type;
|
|
|
|
FetchBlock(file.get(), index_handle, BlockType::kIndex,
|
|
|
|
false /* compressed */, false /* do_uncompress */,
|
|
|
|
heap_buf_allocator, compressed_buf_allocator, index_block,
|
|
|
|
memcpy_stats, &compression_type);
|
|
|
|
ASSERT_EQ(compression_type, CompressionType::kNoCompression);
|
2020-06-30 22:38:59 +00:00
|
|
|
result->assign(index_block->data.ToString());
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Fetches the first data block in both direct IO and non-direct IO mode.
|
|
|
|
//
|
|
|
|
// compressed: whether the data blocks are compressed;
|
|
|
|
// do_uncompress: whether the data blocks should be uncompressed on fetching.
|
|
|
|
// compression_type: the expected compression type.
|
|
|
|
//
|
|
|
|
// Expects:
|
|
|
|
// Block contents are the same.
|
|
|
|
// Bufferr allocation and memory copy statistics are expected.
|
2020-06-30 22:38:59 +00:00
|
|
|
void TestFetchDataBlock(
|
|
|
|
const std::string& table_name_prefix, bool compressed, bool do_uncompress,
|
|
|
|
std::array<TestStats, NumModes> expected_stats_by_mode) {
|
2020-04-24 22:30:12 +00:00
|
|
|
for (CompressionType compression_type : GetSupportedCompressions()) {
|
|
|
|
bool do_compress = compression_type != kNoCompression;
|
2023-12-01 19:10:30 +00:00
|
|
|
if (compressed != do_compress) {
|
|
|
|
continue;
|
|
|
|
}
|
2020-04-24 22:30:12 +00:00
|
|
|
std::string compression_type_str =
|
|
|
|
CompressionTypeToString(compression_type);
|
|
|
|
|
|
|
|
std::string table_name = table_name_prefix + compression_type_str;
|
|
|
|
CreateTable(table_name, compression_type);
|
|
|
|
|
|
|
|
CompressionType expected_compression_type_after_fetch =
|
|
|
|
(compressed && !do_uncompress) ? compression_type : kNoCompression;
|
|
|
|
|
2020-06-30 22:38:59 +00:00
|
|
|
BlockContents blocks[NumModes];
|
|
|
|
std::string block_datas[NumModes];
|
|
|
|
MemcpyStats memcpy_stats[NumModes];
|
|
|
|
CountedMemoryAllocator heap_buf_allocators[NumModes];
|
|
|
|
CountedMemoryAllocator compressed_buf_allocators[NumModes];
|
|
|
|
for (int i = 0; i < NumModes; ++i) {
|
|
|
|
SetMode(static_cast<Mode>(i));
|
|
|
|
FetchFirstDataBlock(table_name, compressed, do_uncompress,
|
|
|
|
expected_compression_type_after_fetch,
|
|
|
|
&heap_buf_allocators[i],
|
|
|
|
&compressed_buf_allocators[i], &blocks[i],
|
|
|
|
&block_datas[i], &memcpy_stats[i]);
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
2020-06-30 22:38:59 +00:00
|
|
|
for (int i = 0; i < NumModes - 1; ++i) {
|
|
|
|
AssertSameBlock(block_datas[i], block_datas[i + 1]);
|
|
|
|
}
|
2020-04-24 22:30:12 +00:00
|
|
|
|
|
|
|
// Check memcpy and buffer allocation statistics.
|
2020-06-30 22:38:59 +00:00
|
|
|
for (int i = 0; i < NumModes; ++i) {
|
|
|
|
const TestStats& expected_stats = expected_stats_by_mode[i];
|
2020-04-24 22:30:12 +00:00
|
|
|
|
2020-06-30 22:38:59 +00:00
|
|
|
ASSERT_EQ(memcpy_stats[i].num_stack_buf_memcpy,
|
2020-04-24 22:30:12 +00:00
|
|
|
expected_stats.memcpy_stats.num_stack_buf_memcpy);
|
2020-06-30 22:38:59 +00:00
|
|
|
ASSERT_EQ(memcpy_stats[i].num_heap_buf_memcpy,
|
2020-04-24 22:30:12 +00:00
|
|
|
expected_stats.memcpy_stats.num_heap_buf_memcpy);
|
2020-06-30 22:38:59 +00:00
|
|
|
ASSERT_EQ(memcpy_stats[i].num_compressed_buf_memcpy,
|
2020-04-24 22:30:12 +00:00
|
|
|
expected_stats.memcpy_stats.num_compressed_buf_memcpy);
|
|
|
|
|
Fix testcase failures on windows (#7992)
Summary:
Fixed 5 test case failures found on Windows 10/Windows Server 2016
1. In `flush_job_test`, the DestroyDir function fails in deconstructor because some file handles are still being held by VersionSet. This happens on Windows Server 2016, so need to manually reset versions_ pointer to release all file handles.
2. In `StatsHistoryTest.InMemoryStatsHistoryPurging` test, the capping memory cost of stats_history_size on Windows becomes 14000 bytes with latest changes, not just 13000 bytes.
3. In `SSTDumpToolTest.RawOutput` test, the output file handle is not closed at the end.
4. In `FullBloomTest.OptimizeForMemory` test, ROCKSDB_MALLOC_USABLE_SIZE is undefined on windows so `total_mem` is always equal to `total_size`. The internal memory fragmentation assertion does not apply in this case.
5. In `BlockFetcherTest.FetchAndUncompressCompressedDataBlock` test, XPRESS cannot reach 87.5% compression ratio with original CreateTable method, so I append extra zeros to the string value to enhance compression ratio. Beside, since XPRESS allocates memory internally, thus does not support for custom allocator verification, we will skip the allocator verification for XPRESS
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7992
Reviewed By: jay-zhuang
Differential Revision: D26615283
Pulled By: ajkr
fbshipit-source-id: 3632612f84b99e2b9c77c403b112b6bedf3b125d
2021-02-23 22:31:50 +00:00
|
|
|
if (kXpressCompression == compression_type) {
|
|
|
|
// XPRESS allocates memory internally, thus does not support for
|
|
|
|
// custom allocator verification
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(
|
|
|
|
heap_buf_allocators[i].GetNumAllocations(),
|
|
|
|
expected_stats.buf_allocation_stats.num_heap_buf_allocations);
|
|
|
|
ASSERT_EQ(compressed_buf_allocators[i].GetNumAllocations(),
|
|
|
|
expected_stats.buf_allocation_stats
|
|
|
|
.num_compressed_buf_allocations);
|
|
|
|
|
|
|
|
// The allocated buffers are not deallocated until
|
|
|
|
// the block content is deleted.
|
|
|
|
ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0);
|
|
|
|
ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0);
|
|
|
|
blocks[i].allocation.reset();
|
|
|
|
ASSERT_EQ(
|
|
|
|
heap_buf_allocators[i].GetNumDeallocations(),
|
|
|
|
expected_stats.buf_allocation_stats.num_heap_buf_allocations);
|
|
|
|
ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(),
|
|
|
|
expected_stats.buf_allocation_stats
|
|
|
|
.num_compressed_buf_allocations);
|
|
|
|
}
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-30 22:38:59 +00:00
|
|
|
void SetMode(Mode mode) {
|
|
|
|
switch (mode) {
|
|
|
|
case Mode::kBufferedRead:
|
|
|
|
options_.use_direct_reads = false;
|
|
|
|
options_.allow_mmap_reads = false;
|
|
|
|
break;
|
|
|
|
case Mode::kBufferedMmap:
|
|
|
|
options_.use_direct_reads = false;
|
|
|
|
options_.allow_mmap_reads = true;
|
|
|
|
break;
|
|
|
|
case Mode::kDirectRead:
|
|
|
|
options_.use_direct_reads = true;
|
|
|
|
options_.allow_mmap_reads = false;
|
|
|
|
break;
|
|
|
|
case Mode::kNumModes:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-24 22:30:12 +00:00
|
|
|
private:
|
|
|
|
std::string test_dir_;
|
|
|
|
Env* env_;
|
|
|
|
std::shared_ptr<FileSystem> fs_;
|
|
|
|
BlockBasedTableFactory table_factory_;
|
2020-06-30 22:38:59 +00:00
|
|
|
Options options_;
|
2020-04-24 22:30:12 +00:00
|
|
|
|
|
|
|
std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
|
|
|
|
|
|
|
|
void WriteToFile(const std::string& content, const std::string& filename) {
|
|
|
|
std::unique_ptr<FSWritableFile> f;
|
|
|
|
ASSERT_OK(fs_->NewWritableFile(Path(filename), FileOptions(), &f, nullptr));
|
|
|
|
ASSERT_OK(f->Append(content, IOOptions(), nullptr));
|
|
|
|
ASSERT_OK(f->Close(IOOptions(), nullptr));
|
|
|
|
}
|
|
|
|
|
|
|
|
void NewFileWriter(const std::string& filename,
|
|
|
|
std::unique_ptr<WritableFileWriter>* writer) {
|
|
|
|
std::string path = Path(filename);
|
2021-01-29 06:08:46 +00:00
|
|
|
FileOptions file_options;
|
|
|
|
ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), path,
|
|
|
|
file_options, writer, nullptr));
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void NewFileReader(const std::string& filename, const FileOptions& opt,
|
|
|
|
std::unique_ptr<RandomAccessFileReader>* reader) {
|
|
|
|
std::string path = Path(filename);
|
|
|
|
std::unique_ptr<FSRandomAccessFile> f;
|
|
|
|
ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr));
|
2021-03-15 11:32:24 +00:00
|
|
|
reader->reset(new RandomAccessFileReader(std::move(f), path,
|
|
|
|
env_->GetSystemClock().get()));
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
void NewTableReader(const ImmutableOptions& ioptions,
|
2020-04-24 22:30:12 +00:00
|
|
|
const FileOptions& foptions,
|
|
|
|
const InternalKeyComparator& comparator,
|
|
|
|
const std::string& table_name,
|
|
|
|
std::unique_ptr<BlockBasedTable>* table) {
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file;
|
|
|
|
NewFileReader(table_name, foptions, &file);
|
|
|
|
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size));
|
|
|
|
|
|
|
|
std::unique_ptr<TableReader> table_reader;
|
2020-06-29 21:51:57 +00:00
|
|
|
ReadOptions ro;
|
2020-09-14 23:59:00 +00:00
|
|
|
const auto* table_options =
|
|
|
|
table_factory_.GetOptions<BlockBasedTableOptions>();
|
|
|
|
ASSERT_NE(table_options, nullptr);
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options,
|
|
|
|
comparator, std::move(file), file_size,
|
|
|
|
0 /* block_protection_bytes_per_key */,
|
|
|
|
&table_reader, 0 /* tail_size */));
|
2020-04-24 22:30:12 +00:00
|
|
|
|
|
|
|
table->reset(reinterpret_cast<BlockBasedTable*>(table_reader.release()));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string ToInternalKey(const std::string& key) {
|
|
|
|
InternalKey internal_key(key, 0, ValueType::kTypeValue);
|
|
|
|
return internal_key.Encode().ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
void ReadFooter(RandomAccessFileReader* file, Footer* footer) {
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size));
|
2020-06-29 21:51:57 +00:00
|
|
|
IOOptions opts;
|
2022-12-09 18:03:47 +00:00
|
|
|
ASSERT_OK(ReadFooterFromFile(opts, file, *fs_,
|
|
|
|
nullptr /* prefetch_buffer */, file_size,
|
|
|
|
footer, kBlockBasedTableMagicNumber));
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NOTE: compression_type returns the compression type of the fetched block
|
|
|
|
// contents, so if the block is fetched and uncompressed, then it's
|
|
|
|
// kNoCompression.
|
|
|
|
void FetchBlock(RandomAccessFileReader* file, const BlockHandle& block,
|
|
|
|
BlockType block_type, bool compressed, bool do_uncompress,
|
|
|
|
MemoryAllocator* heap_buf_allocator,
|
|
|
|
MemoryAllocator* compressed_buf_allocator,
|
|
|
|
BlockContents* contents, MemcpyStats* stats,
|
Support compressed and local flash secondary cache stacking (#11812)
Summary:
This PR implements support for a three tier cache - primary block cache, compressed secondary cache, and a nvm (local flash) secondary cache. This allows more effective utilization of the nvm cache, and minimizes the number of reads from local flash by caching compressed blocks in the compressed secondary cache.
The basic design is as follows -
1. A new secondary cache implementation, ```TieredSecondaryCache```, is introduced. It keeps the compressed and nvm secondary caches and manages the movement of blocks between them and the primary block cache. To setup a three tier cache, we allocate a ```CacheWithSecondaryAdapter```, with a ```TieredSecondaryCache``` instance as the secondary cache.
2. The table reader passes both the uncompressed and compressed block to ```FullTypedCacheInterface::InsertFull```, allowing the block cache to optionally store the compressed block.
3. When there's a miss, the block object is constructed and inserted in the primary cache, and the compressed block is inserted into the nvm cache by calling ```InsertSaved```. This avoids the overhead of recompressing the block, as well as avoiding putting more memory pressure on the compressed secondary cache.
4. When there's a hit in the nvm cache, we attempt to insert the block in the compressed secondary cache and the primary cache, subject to the admission policy of those caches (i.e admit on second access). Blocks/items evicted from any tier are simply discarded.
We can easily implement additional admission policies if desired.
Todo (In a subsequent PR):
1. Add to db_bench and run benchmarks
2. Add to db_stress
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11812
Reviewed By: pdillinger
Differential Revision: D49461842
Pulled By: anand1976
fbshipit-source-id: b40ac1330ef7cd8c12efa0a3ca75128e602e3a0b
2023-09-22 03:30:53 +00:00
|
|
|
CompressionType* compression_type) {
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options_);
|
2020-04-24 22:30:12 +00:00
|
|
|
ReadOptions roptions;
|
|
|
|
PersistentCacheOptions persistent_cache_options;
|
|
|
|
Footer footer;
|
|
|
|
ReadFooter(file, &footer);
|
|
|
|
std::unique_ptr<BlockFetcher> fetcher(new BlockFetcher(
|
|
|
|
file, nullptr /* prefetch_buffer */, footer, roptions, block, contents,
|
|
|
|
ioptions, do_uncompress, compressed, block_type,
|
|
|
|
UncompressionDict::GetEmptyDict(), persistent_cache_options,
|
|
|
|
heap_buf_allocator, compressed_buf_allocator));
|
|
|
|
|
|
|
|
ASSERT_OK(fetcher->ReadBlockContents());
|
|
|
|
|
|
|
|
stats->num_stack_buf_memcpy = fetcher->TEST_GetNumStackBufMemcpy();
|
|
|
|
stats->num_heap_buf_memcpy = fetcher->TEST_GetNumHeapBufMemcpy();
|
|
|
|
stats->num_compressed_buf_memcpy =
|
|
|
|
fetcher->TEST_GetNumCompressedBufMemcpy();
|
|
|
|
|
Support compressed and local flash secondary cache stacking (#11812)
Summary:
This PR implements support for a three tier cache - primary block cache, compressed secondary cache, and a nvm (local flash) secondary cache. This allows more effective utilization of the nvm cache, and minimizes the number of reads from local flash by caching compressed blocks in the compressed secondary cache.
The basic design is as follows -
1. A new secondary cache implementation, ```TieredSecondaryCache```, is introduced. It keeps the compressed and nvm secondary caches and manages the movement of blocks between them and the primary block cache. To setup a three tier cache, we allocate a ```CacheWithSecondaryAdapter```, with a ```TieredSecondaryCache``` instance as the secondary cache.
2. The table reader passes both the uncompressed and compressed block to ```FullTypedCacheInterface::InsertFull```, allowing the block cache to optionally store the compressed block.
3. When there's a miss, the block object is constructed and inserted in the primary cache, and the compressed block is inserted into the nvm cache by calling ```InsertSaved```. This avoids the overhead of recompressing the block, as well as avoiding putting more memory pressure on the compressed secondary cache.
4. When there's a hit in the nvm cache, we attempt to insert the block in the compressed secondary cache and the primary cache, subject to the admission policy of those caches (i.e admit on second access). Blocks/items evicted from any tier are simply discarded.
We can easily implement additional admission policies if desired.
Todo (In a subsequent PR):
1. Add to db_bench and run benchmarks
2. Add to db_stress
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11812
Reviewed By: pdillinger
Differential Revision: D49461842
Pulled By: anand1976
fbshipit-source-id: b40ac1330ef7cd8c12efa0a3ca75128e602e3a0b
2023-09-22 03:30:53 +00:00
|
|
|
if (do_uncompress) {
|
|
|
|
*compression_type = kNoCompression;
|
|
|
|
} else {
|
|
|
|
*compression_type = fetcher->get_compression_type();
|
|
|
|
}
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NOTE: expected_compression_type is the expected compression
|
|
|
|
// type of the fetched block content, if the block is uncompressed,
|
|
|
|
// then the expected compression type is kNoCompression.
|
2020-06-30 22:38:59 +00:00
|
|
|
void FetchFirstDataBlock(const std::string& table_name, bool compressed,
|
|
|
|
bool do_uncompress,
|
2020-04-24 22:30:12 +00:00
|
|
|
CompressionType expected_compression_type,
|
|
|
|
MemoryAllocator* heap_buf_allocator,
|
|
|
|
MemoryAllocator* compressed_buf_allocator,
|
2020-06-30 22:38:59 +00:00
|
|
|
BlockContents* block, std::string* result,
|
|
|
|
MemcpyStats* memcpy_stats) {
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options_);
|
2020-06-30 22:38:59 +00:00
|
|
|
InternalKeyComparator comparator(options_.comparator);
|
|
|
|
FileOptions foptions(options_);
|
2020-04-24 22:30:12 +00:00
|
|
|
|
|
|
|
// Get block handle for the first data block.
|
|
|
|
std::unique_ptr<BlockBasedTable> table;
|
|
|
|
NewTableReader(ioptions, foptions, comparator, table_name, &table);
|
|
|
|
|
|
|
|
std::unique_ptr<BlockBasedTable::IndexReader> index_reader;
|
2020-06-29 21:51:57 +00:00
|
|
|
ReadOptions ro;
|
2020-04-24 22:30:12 +00:00
|
|
|
ASSERT_OK(BinarySearchIndexReader::Create(
|
2020-06-29 21:51:57 +00:00
|
|
|
table.get(), ro, nullptr /* prefetch_buffer */, false /* use_cache */,
|
2020-04-24 22:30:12 +00:00
|
|
|
false /* prefetch */, false /* pin */, nullptr /* lookup_context */,
|
|
|
|
&index_reader));
|
|
|
|
|
|
|
|
std::unique_ptr<InternalIteratorBase<IndexValue>> iter(
|
|
|
|
index_reader->NewIterator(
|
|
|
|
ReadOptions(), false /* disable_prefix_seek */, nullptr /* iter */,
|
|
|
|
nullptr /* get_context */, nullptr /* lookup_context */));
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
iter->SeekToFirst();
|
|
|
|
BlockHandle first_block_handle = iter->value().handle;
|
|
|
|
|
|
|
|
// Fetch first data block.
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file;
|
|
|
|
NewFileReader(table_name, foptions, &file);
|
|
|
|
CompressionType compression_type;
|
|
|
|
FetchBlock(file.get(), first_block_handle, BlockType::kData, compressed,
|
|
|
|
do_uncompress, heap_buf_allocator, compressed_buf_allocator,
|
|
|
|
block, memcpy_stats, &compression_type);
|
|
|
|
ASSERT_EQ(compression_type, expected_compression_type);
|
2020-06-30 22:38:59 +00:00
|
|
|
result->assign(block->data.ToString());
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-05-21 20:55:18 +00:00
|
|
|
// Skip the following tests in lite mode since direct I/O is unsupported.
|
|
|
|
|
2020-04-24 22:30:12 +00:00
|
|
|
// Fetch index block under both direct IO and non-direct IO.
|
|
|
|
// Expects:
|
|
|
|
// the index block contents are the same for both read modes.
|
|
|
|
TEST_F(BlockFetcherTest, FetchIndexBlock) {
|
|
|
|
for (CompressionType compression : GetSupportedCompressions()) {
|
|
|
|
std::string table_name =
|
|
|
|
"FetchIndexBlock" + CompressionTypeToString(compression);
|
|
|
|
CreateTable(table_name, compression);
|
|
|
|
|
|
|
|
CountedMemoryAllocator allocator;
|
|
|
|
MemcpyStats memcpy_stats;
|
2020-06-30 22:38:59 +00:00
|
|
|
BlockContents indexes[NumModes];
|
|
|
|
std::string index_datas[NumModes];
|
|
|
|
for (int i = 0; i < NumModes; ++i) {
|
|
|
|
SetMode(static_cast<Mode>(i));
|
|
|
|
FetchIndexBlock(table_name, &allocator, &allocator, &memcpy_stats,
|
|
|
|
&indexes[i], &index_datas[i]);
|
|
|
|
}
|
|
|
|
for (int i = 0; i < NumModes - 1; ++i) {
|
|
|
|
AssertSameBlock(index_datas[i], index_datas[i + 1]);
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Data blocks are not compressed,
|
2020-06-30 22:38:59 +00:00
|
|
|
// fetch data block under direct IO, mmap IO,and non-direct IO.
|
2020-04-24 22:30:12 +00:00
|
|
|
// Expects:
|
|
|
|
// 1. in non-direct IO mode, allocate a heap buffer and memcpy the block
|
|
|
|
// into the buffer;
|
|
|
|
// 2. in direct IO mode, allocate a heap buffer and memcpy from the
|
|
|
|
// direct IO buffer to the heap buffer.
|
|
|
|
TEST_F(BlockFetcherTest, FetchUncompressedDataBlock) {
|
2020-06-30 22:38:59 +00:00
|
|
|
TestStats expected_non_mmap_stats = {
|
|
|
|
{
|
|
|
|
0 /* num_stack_buf_memcpy */,
|
|
|
|
1 /* num_heap_buf_memcpy */,
|
|
|
|
0 /* num_compressed_buf_memcpy */,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
1 /* num_heap_buf_allocations */,
|
|
|
|
0 /* num_compressed_buf_allocations */,
|
|
|
|
}};
|
|
|
|
TestStats expected_mmap_stats = {{
|
|
|
|
0 /* num_stack_buf_memcpy */,
|
|
|
|
0 /* num_heap_buf_memcpy */,
|
|
|
|
0 /* num_compressed_buf_memcpy */,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
0 /* num_heap_buf_allocations */,
|
|
|
|
0 /* num_compressed_buf_allocations */,
|
|
|
|
}};
|
|
|
|
std::array<TestStats, NumModes> expected_stats_by_mode{{
|
|
|
|
expected_non_mmap_stats /* kBufferedRead */,
|
|
|
|
expected_mmap_stats /* kBufferedMmap */,
|
|
|
|
expected_non_mmap_stats /* kDirectRead */,
|
|
|
|
}};
|
|
|
|
TestFetchDataBlock("FetchUncompressedDataBlock", false, false,
|
|
|
|
expected_stats_by_mode);
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Data blocks are compressed,
|
|
|
|
// fetch data block under both direct IO and non-direct IO,
|
|
|
|
// but do not uncompress.
|
|
|
|
// Expects:
|
|
|
|
// 1. in non-direct IO mode, allocate a compressed buffer and memcpy the block
|
|
|
|
// into the buffer;
|
|
|
|
// 2. in direct IO mode, allocate a compressed buffer and memcpy from the
|
|
|
|
// direct IO buffer to the compressed buffer.
|
|
|
|
TEST_F(BlockFetcherTest, FetchCompressedDataBlock) {
|
2020-06-30 22:38:59 +00:00
|
|
|
TestStats expected_non_mmap_stats = {
|
|
|
|
{
|
|
|
|
0 /* num_stack_buf_memcpy */,
|
|
|
|
0 /* num_heap_buf_memcpy */,
|
|
|
|
1 /* num_compressed_buf_memcpy */,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
0 /* num_heap_buf_allocations */,
|
|
|
|
1 /* num_compressed_buf_allocations */,
|
|
|
|
}};
|
|
|
|
TestStats expected_mmap_stats = {{
|
|
|
|
0 /* num_stack_buf_memcpy */,
|
|
|
|
0 /* num_heap_buf_memcpy */,
|
|
|
|
0 /* num_compressed_buf_memcpy */,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
0 /* num_heap_buf_allocations */,
|
|
|
|
0 /* num_compressed_buf_allocations */,
|
|
|
|
}};
|
|
|
|
std::array<TestStats, NumModes> expected_stats_by_mode{{
|
|
|
|
expected_non_mmap_stats /* kBufferedRead */,
|
|
|
|
expected_mmap_stats /* kBufferedMmap */,
|
|
|
|
expected_non_mmap_stats /* kDirectRead */,
|
|
|
|
}};
|
|
|
|
TestFetchDataBlock("FetchCompressedDataBlock", true, false,
|
|
|
|
expected_stats_by_mode);
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Data blocks are compressed,
|
|
|
|
// fetch and uncompress data block under both direct IO and non-direct IO.
|
|
|
|
// Expects:
|
|
|
|
// 1. in non-direct IO mode, since the block is small, so it's first memcpyed
|
|
|
|
// to the stack buffer, then a heap buffer is allocated and the block is
|
|
|
|
// uncompressed into the heap.
|
|
|
|
// 2. in direct IO mode mode, allocate a heap buffer, then directly uncompress
|
|
|
|
// and memcpy from the direct IO buffer to the heap buffer.
|
|
|
|
TEST_F(BlockFetcherTest, FetchAndUncompressCompressedDataBlock) {
|
2020-06-30 22:38:59 +00:00
|
|
|
TestStats expected_buffered_read_stats = {
|
|
|
|
{
|
|
|
|
1 /* num_stack_buf_memcpy */,
|
|
|
|
1 /* num_heap_buf_memcpy */,
|
|
|
|
0 /* num_compressed_buf_memcpy */,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
1 /* num_heap_buf_allocations */,
|
|
|
|
0 /* num_compressed_buf_allocations */,
|
|
|
|
}};
|
|
|
|
TestStats expected_mmap_stats = {{
|
|
|
|
0 /* num_stack_buf_memcpy */,
|
|
|
|
1 /* num_heap_buf_memcpy */,
|
|
|
|
0 /* num_compressed_buf_memcpy */,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
1 /* num_heap_buf_allocations */,
|
|
|
|
0 /* num_compressed_buf_allocations */,
|
|
|
|
}};
|
|
|
|
TestStats expected_direct_read_stats = {
|
|
|
|
{
|
|
|
|
0 /* num_stack_buf_memcpy */,
|
|
|
|
1 /* num_heap_buf_memcpy */,
|
|
|
|
0 /* num_compressed_buf_memcpy */,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
1 /* num_heap_buf_allocations */,
|
|
|
|
0 /* num_compressed_buf_allocations */,
|
|
|
|
}};
|
|
|
|
std::array<TestStats, NumModes> expected_stats_by_mode{{
|
|
|
|
expected_buffered_read_stats,
|
|
|
|
expected_mmap_stats,
|
|
|
|
expected_direct_read_stats,
|
|
|
|
}};
|
2020-04-24 22:30:12 +00:00
|
|
|
TestFetchDataBlock("FetchAndUncompressCompressedDataBlock", true, true,
|
2020-06-30 22:38:59 +00:00
|
|
|
expected_stats_by_mode);
|
2020-04-24 22:30:12 +00:00
|
|
|
}
|
|
|
|
|
2020-05-21 20:55:18 +00:00
|
|
|
|
2020-04-24 22:30:12 +00:00
|
|
|
} // namespace
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|