// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "db/blob/blob_file_cache.h"

#include <cassert>
#include <string>

#include "db/blob/blob_log_format.h"
#include "db/blob/blob_log_writer.h"
#include "env/mock_env.h"
#include "file/filename.h"
#include "file/read_write_util.h"
#include "file/writable_file_writer.h"
#include "options/cf_options.h"
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "test_util/sync_point.h"
#include "test_util/testharness.h"

namespace ROCKSDB_NAMESPACE {

namespace {

// Creates a test blob file with a single blob in it.
void WriteBlobFile(uint32_t column_family_id,
                   const ImmutableOptions& immutable_options,
                   uint64_t blob_file_number) {
  assert(!immutable_options.cf_paths.empty());

  const std::string blob_file_path =
      BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);

  std::unique_ptr<FSWritableFile> file;
  ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
                            FileOptions()));

  std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
      std::move(file), blob_file_path, FileOptions(), immutable_options.clock));

  constexpr Statistics* statistics = nullptr;
  constexpr bool use_fsync = false;
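
  // The integrated BlobDB builds blob files in background jobs and buffers
  // blobs before writing them out, so the blob file does not have to be
  // flushed after each record; only the legacy stacked BlobDB, which treats
  // blob files as logs, requests a flush per record (see PR #7892).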
  constexpr bool do_flush = false;

  BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
                                statistics, blob_file_number, use_fsync,
                                do_flush);

  constexpr bool has_ttl = false;
  constexpr ExpirationRange expiration_range;

  BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
                       expiration_range);
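
  // The WriteOptions passed to WriteHeader, AddRecord, and AppendFooter below
  // get converted to IOOptions and allow the file writer to attribute these
  // writes to the correct IO activity in the statistics (see PR #11910).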
  ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));

  constexpr char key[] = "key";
  constexpr char blob[] = "blob";

  std::string compressed_blob;

  uint64_t key_offset = 0;
  uint64_t blob_offset = 0;
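
  // AddRecord reports back the offsets of the key and the blob within the
  // file.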
  ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), key, blob, &key_offset,
                                      &blob_offset));

  BlobLogFooter footer;
  footer.blob_count = 1;
  footer.expiration_range = expiration_range;

  std::string checksum_method;
  std::string checksum_value;
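
  // AppendFooter reports back the file checksum method and value (when file
  // checksums are enabled); this test does not use them.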
  ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer,
                                         &checksum_method, &checksum_value));
}

}  // anonymous namespace

class BlobFileCacheTest : public testing::Test {
 protected:
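  // Use a mock (in-memory) environment so the tests do not touch the real
  // filesystem.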
|
2021-09-21 15:53:03 +00:00
|
|
|
BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
|
2020-10-15 20:02:44 +00:00
|
|
|
|
2021-09-21 15:53:03 +00:00
|
|
|
std::unique_ptr<Env> mock_env_;
|
2020-10-15 20:02:44 +00:00
|
|
|
};
|
|
|
|
|
|
|
|

TEST_F(BlobFileCacheTest, GetBlobFileReader) {
  Options options;
  options.env = mock_env_.get();
  options.statistics = CreateDBStatistics();
  options.cf_paths.emplace_back(
      test::PerThreadDBPath(mock_env_.get(),
                            "BlobFileCacheTest_GetBlobFileReader"),
      0);
  options.enable_blob_files = true;

  constexpr uint32_t column_family_id = 1;
  ImmutableOptions immutable_options(options);
  constexpr uint64_t blob_file_number = 123;

  WriteBlobFile(column_family_id, immutable_options, blob_file_number);

  constexpr size_t capacity = 10;
  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);

  FileOptions file_options;
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
                                &file_options, column_family_id,
                                blob_file_read_hist, nullptr /*IOTracer*/);

  // First try: reader should be opened and put in cache
  CacheHandleGuard<BlobFileReader> first;

  const ReadOptions read_options;
  ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
                                              &first));
  ASSERT_NE(first.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);

  // Second try: reader should be served from cache
  CacheHandleGuard<BlobFileReader> second;

  ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
                                              &second));
  ASSERT_NE(second.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);

  ASSERT_EQ(first.GetValue(), second.GetValue());
}

TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) {
  Options options;
  options.env = mock_env_.get();
  options.statistics = CreateDBStatistics();
  options.cf_paths.emplace_back(
      test::PerThreadDBPath(mock_env_.get(),
                            "BlobFileCacheTest_GetBlobFileReader_Race"),
      0);
  options.enable_blob_files = true;

  constexpr uint32_t column_family_id = 1;
  ImmutableOptions immutable_options(options);
  constexpr uint64_t blob_file_number = 123;

  WriteBlobFile(column_family_id, immutable_options, blob_file_number);

  constexpr size_t capacity = 10;
  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);

  FileOptions file_options;
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
                                &file_options, column_family_id,
                                blob_file_read_hist, nullptr /*IOTracer*/);

  CacheHandleGuard<BlobFileReader> first;
  CacheHandleGuard<BlobFileReader> second;

  const ReadOptions read_options;
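
  // Request the reader from the sync point callback as well; this simulates a
  // concurrent request that populates the cache while the original caller is
  // in the middle of BlobFileCache's double-checked locking, so the file
  // should only be opened once.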
  SyncPoint::GetInstance()->SetCallBack(
      "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) {
        // Disabling sync points to prevent infinite recursion
        SyncPoint::GetInstance()->DisableProcessing();
        ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options,
                                                    blob_file_number, &second));
        ASSERT_NE(second.GetValue(), nullptr);
        ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
        ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
      });
  SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
                                              &first));
  ASSERT_NE(first.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);

  ASSERT_EQ(first.GetValue(), second.GetValue());

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}

TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) {
  Options options;
  options.env = mock_env_.get();
  options.statistics = CreateDBStatistics();
  options.cf_paths.emplace_back(
      test::PerThreadDBPath(mock_env_.get(),
                            "BlobFileCacheTest_GetBlobFileReader_IOError"),
      0);
  options.enable_blob_files = true;

  constexpr size_t capacity = 10;
  std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);

  ImmutableOptions immutable_options(options);
  FileOptions file_options;
  constexpr uint32_t column_family_id = 1;
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
                                &file_options, column_family_id,
                                blob_file_read_hist, nullptr /*IOTracer*/);

  // Note: there is no blob file with the below number
  constexpr uint64_t blob_file_number = 123;

  CacheHandleGuard<BlobFileReader> reader;

  const ReadOptions read_options;
  ASSERT_TRUE(
      blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader)
          .IsIOError());
  ASSERT_EQ(reader.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
}

TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) {
  Options options;
  options.env = mock_env_.get();
  options.statistics = CreateDBStatistics();
  options.cf_paths.emplace_back(
      test::PerThreadDBPath(mock_env_.get(),
                            "BlobFileCacheTest_GetBlobFileReader_CacheFull"),
      0);
  options.enable_blob_files = true;

  constexpr uint32_t column_family_id = 1;
  ImmutableOptions immutable_options(options);
  constexpr uint64_t blob_file_number = 123;

  WriteBlobFile(column_family_id, immutable_options, blob_file_number);

  constexpr size_t capacity = 0;
  constexpr int num_shard_bits = -1;  // determined automatically
  constexpr bool strict_capacity_limit = true;
  std::shared_ptr<Cache> backing_cache =
      NewLRUCache(capacity, num_shard_bits, strict_capacity_limit);

  FileOptions file_options;
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
                                &file_options, column_family_id,
                                blob_file_read_hist, nullptr /*IOTracer*/);

  // Insert into cache should fail since it has zero capacity and
  // strict_capacity_limit is set
  CacheHandleGuard<BlobFileReader> reader;

  const ReadOptions read_options;
  ASSERT_TRUE(
      blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader)
          .IsMemoryLimit());
  ASSERT_EQ(reader.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
}

}  // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}