2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2015-09-23 19:42:43 +00:00
|
|
|
|
|
|
|
#include "rocksdb/sst_file_writer.h"
|
|
|
|
|
|
|
|
#include <vector>
|
2019-05-30 21:47:29 +00:00
|
|
|
|
2021-08-18 18:32:00 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2015-09-23 19:42:43 +00:00
|
|
|
#include "db/dbformat.h"
|
2023-08-11 01:16:10 +00:00
|
|
|
#include "db/wide/wide_column_serialization.h"
|
2023-09-12 19:36:07 +00:00
|
|
|
#include "db/wide/wide_columns_helper.h"
|
2019-09-16 17:31:27 +00:00
|
|
|
#include "file/writable_file_writer.h"
|
2021-01-29 06:08:46 +00:00
|
|
|
#include "rocksdb/file_system.h"
|
2015-09-23 19:42:43 +00:00
|
|
|
#include "rocksdb/table.h"
|
2019-05-30 21:47:29 +00:00
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
2016-10-18 23:59:37 +00:00
|
|
|
#include "table/sst_file_writer_collectors.h"
|
2019-05-30 18:21:38 +00:00
|
|
|
#include "test_util/sync_point.h"
|
2015-09-23 19:42:43 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2015-09-23 19:42:43 +00:00
|
|
|
|
|
|
|
const std::string ExternalSstFilePropertyNames::kVersion =
|
|
|
|
"rocksdb.external_sst_file.version";
|
2016-10-18 23:59:37 +00:00
|
|
|
const std::string ExternalSstFilePropertyNames::kGlobalSeqno =
|
|
|
|
"rocksdb.external_sst_file.global_seqno";
|
2017-02-28 19:05:08 +00:00
|
|
|
|
|
|
|
|
2022-10-25 18:50:38 +00:00
|
|
|
const size_t kFadviseTrigger = 1024 * 1024; // 1MB
|
2015-09-23 19:42:43 +00:00
|
|
|
|
|
|
|
struct SstFileWriter::Rep {
|
2016-05-17 20:11:56 +00:00
|
|
|
Rep(const EnvOptions& _env_options, const Options& options,
|
2017-05-23 18:36:06 +00:00
|
|
|
Env::IOPriority _io_priority, const Comparator* _user_comparator,
|
2021-08-21 03:39:52 +00:00
|
|
|
ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters,
|
|
|
|
std::string _db_session_id)
|
2015-09-23 19:42:43 +00:00
|
|
|
: env_options(_env_options),
|
2016-05-17 20:11:56 +00:00
|
|
|
ioptions(options),
|
2016-09-14 04:11:59 +00:00
|
|
|
mutable_cf_options(options),
|
2017-05-23 18:36:06 +00:00
|
|
|
io_priority(_io_priority),
|
2016-12-05 22:16:23 +00:00
|
|
|
internal_comparator(_user_comparator),
|
2016-12-29 22:51:52 +00:00
|
|
|
cfh(_cfh),
|
|
|
|
invalidate_page_cache(_invalidate_page_cache),
|
2021-08-21 03:39:52 +00:00
|
|
|
skip_filters(_skip_filters),
|
2024-02-14 04:30:07 +00:00
|
|
|
db_session_id(_db_session_id),
|
|
|
|
ts_sz(_user_comparator->timestamp_size()),
|
|
|
|
strip_timestamp(ts_sz > 0 &&
|
|
|
|
!ioptions.persist_user_defined_timestamps) {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO (hx235): pass in `WriteOptions` instead of `rate_limiter_priority`
|
|
|
|
// during construction
|
|
|
|
write_options.rate_limiter_priority = io_priority;
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer;
|
|
|
|
std::unique_ptr<TableBuilder> builder;
|
|
|
|
EnvOptions env_options;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions;
|
2016-05-17 20:11:56 +00:00
|
|
|
MutableCFOptions mutable_cf_options;
|
2017-05-23 18:36:06 +00:00
|
|
|
Env::IOPriority io_priority;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
WriteOptions write_options;
|
2015-09-23 19:42:43 +00:00
|
|
|
InternalKeyComparator internal_comparator;
|
|
|
|
ExternalSstFileInfo file_info;
|
2016-06-13 16:57:43 +00:00
|
|
|
InternalKey ikey;
|
2016-12-05 22:16:23 +00:00
|
|
|
std::string column_family_name;
|
|
|
|
ColumnFamilyHandle* cfh;
|
2016-12-29 22:51:52 +00:00
|
|
|
// If true, We will give the OS a hint that this file pages is not needed
|
2018-03-08 18:18:34 +00:00
|
|
|
// every time we write 1MB to the file.
|
2016-12-29 22:51:52 +00:00
|
|
|
bool invalidate_page_cache;
|
2017-05-23 18:36:06 +00:00
|
|
|
// The size of the file during the last time we called Fadvise to remove
|
2016-12-29 22:51:52 +00:00
|
|
|
// cached pages from page cache.
|
2021-08-21 03:39:52 +00:00
|
|
|
uint64_t last_fadvise_size = 0;
|
2018-01-22 22:37:37 +00:00
|
|
|
bool skip_filters;
|
2021-08-21 03:39:52 +00:00
|
|
|
std::string db_session_id;
|
|
|
|
uint64_t next_file_number = 1;
|
2024-02-14 04:30:07 +00:00
|
|
|
size_t ts_sz;
|
|
|
|
bool strip_timestamp;
|
2021-08-21 03:39:52 +00:00
|
|
|
|
2021-09-10 01:57:01 +00:00
|
|
|
Status AddImpl(const Slice& user_key, const Slice& value,
|
|
|
|
ValueType value_type) {
|
2017-05-26 19:05:19 +00:00
|
|
|
if (!builder) {
|
|
|
|
return Status::InvalidArgument("File is not opened");
|
|
|
|
}
|
2024-01-25 18:44:00 +00:00
|
|
|
if (!builder->status().ok()) {
|
|
|
|
return builder->status();
|
|
|
|
}
|
2017-05-26 19:05:19 +00:00
|
|
|
|
2024-02-14 04:30:07 +00:00
|
|
|
assert(user_key.size() >= ts_sz);
|
|
|
|
if (strip_timestamp) {
|
|
|
|
// In this mode, we expect users to always provide a min timestamp.
|
|
|
|
if (internal_comparator.user_comparator()->CompareTimestamp(
|
|
|
|
Slice(user_key.data() + user_key.size() - ts_sz, ts_sz),
|
|
|
|
MinU64Ts()) != 0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"persist_user_defined_timestamps flag is set to false, only "
|
|
|
|
"minimum timestamp is accepted.");
|
|
|
|
}
|
|
|
|
}
|
2017-05-26 19:05:19 +00:00
|
|
|
if (file_info.num_entries == 0) {
|
|
|
|
file_info.smallest_key.assign(user_key.data(), user_key.size());
|
|
|
|
} else {
|
|
|
|
if (internal_comparator.user_comparator()->Compare(
|
|
|
|
user_key, file_info.largest_key) <= 0) {
|
|
|
|
// Make sure that keys are added in order
|
2020-01-06 18:55:53 +00:00
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Keys must be added in strict ascending order.");
|
2017-05-26 19:05:19 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-09-10 01:57:01 +00:00
|
|
|
assert(value_type == kTypeValue || value_type == kTypeMerge ||
|
|
|
|
value_type == kTypeDeletion ||
|
2023-08-11 01:16:10 +00:00
|
|
|
value_type == kTypeDeletionWithTimestamp ||
|
|
|
|
value_type == kTypeWideColumnEntity);
|
2021-09-10 01:57:01 +00:00
|
|
|
|
|
|
|
constexpr SequenceNumber sequence_number = 0;
|
|
|
|
|
|
|
|
ikey.Set(user_key, sequence_number, value_type);
|
|
|
|
|
2017-05-26 19:05:19 +00:00
|
|
|
builder->Add(ikey.Encode(), value);
|
|
|
|
|
|
|
|
// update file info
|
|
|
|
file_info.num_entries++;
|
|
|
|
file_info.largest_key.assign(user_key.data(), user_key.size());
|
|
|
|
file_info.file_size = builder->FileSize();
|
|
|
|
|
2021-04-06 18:53:54 +00:00
|
|
|
InvalidatePageCache(false /* closing */).PermitUncheckedError();
|
2024-01-25 18:44:00 +00:00
|
|
|
return builder->status();
|
2017-05-26 19:05:19 +00:00
|
|
|
}
|
|
|
|
|
2021-09-10 01:57:01 +00:00
|
|
|
Status Add(const Slice& user_key, const Slice& value, ValueType value_type) {
|
2022-07-14 17:09:31 +00:00
|
|
|
if (internal_comparator.user_comparator()->timestamp_size() != 0) {
|
2021-09-10 01:57:01 +00:00
|
|
|
return Status::InvalidArgument("Timestamp size mismatch");
|
|
|
|
}
|
|
|
|
|
|
|
|
return AddImpl(user_key, value, value_type);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status Add(const Slice& user_key, const Slice& timestamp, const Slice& value,
|
|
|
|
ValueType value_type) {
|
|
|
|
const size_t timestamp_size = timestamp.size();
|
|
|
|
|
2022-07-14 17:09:31 +00:00
|
|
|
if (internal_comparator.user_comparator()->timestamp_size() !=
|
|
|
|
timestamp_size) {
|
2021-09-10 01:57:01 +00:00
|
|
|
return Status::InvalidArgument("Timestamp size mismatch");
|
|
|
|
}
|
|
|
|
|
|
|
|
const size_t user_key_size = user_key.size();
|
|
|
|
|
|
|
|
if (user_key.data() + user_key_size == timestamp.data()) {
|
|
|
|
Slice user_key_with_ts(user_key.data(), user_key_size + timestamp_size);
|
|
|
|
return AddImpl(user_key_with_ts, value, value_type);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string user_key_with_ts;
|
|
|
|
user_key_with_ts.reserve(user_key_size + timestamp_size);
|
|
|
|
user_key_with_ts.append(user_key.data(), user_key_size);
|
|
|
|
user_key_with_ts.append(timestamp.data(), timestamp_size);
|
|
|
|
|
|
|
|
return AddImpl(user_key_with_ts, value, value_type);
|
|
|
|
}
|
|
|
|
|
2023-08-11 01:16:10 +00:00
|
|
|
Status AddEntity(const Slice& user_key, const WideColumns& columns) {
|
|
|
|
WideColumns sorted_columns(columns);
|
2023-09-12 19:36:07 +00:00
|
|
|
WideColumnsHelper::SortColumns(sorted_columns);
|
2023-08-11 01:16:10 +00:00
|
|
|
|
|
|
|
std::string entity;
|
|
|
|
const Status s = WideColumnSerialization::Serialize(sorted_columns, entity);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
if (entity.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
|
|
|
return Status::InvalidArgument("wide column entity is too large");
|
|
|
|
}
|
|
|
|
return Add(user_key, entity, kTypeWideColumnEntity);
|
|
|
|
}
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
Status DeleteRangeImpl(const Slice& begin_key, const Slice& end_key) {
|
2018-07-14 05:40:23 +00:00
|
|
|
if (!builder) {
|
|
|
|
return Status::InvalidArgument("File is not opened");
|
|
|
|
}
|
2023-03-23 04:03:13 +00:00
|
|
|
int cmp = internal_comparator.user_comparator()->CompareWithoutTimestamp(
|
|
|
|
begin_key, end_key);
|
|
|
|
if (cmp > 0) {
|
|
|
|
// It's an empty range where endpoints appear mistaken. Don't bother
|
|
|
|
// applying it to the DB, and return an error to the user.
|
|
|
|
return Status::InvalidArgument("end key comes before start key");
|
|
|
|
} else if (cmp == 0) {
|
|
|
|
// It's an empty range. Don't bother applying it to the DB.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2024-02-14 04:30:07 +00:00
|
|
|
assert(begin_key.size() >= ts_sz);
|
|
|
|
assert(end_key.size() >= ts_sz);
|
|
|
|
Slice begin_key_ts =
|
|
|
|
Slice(begin_key.data() + begin_key.size() - ts_sz, ts_sz);
|
|
|
|
Slice end_key_ts = Slice(end_key.data() + end_key.size() - ts_sz, ts_sz);
|
|
|
|
assert(begin_key_ts.compare(end_key_ts) == 0);
|
|
|
|
if (strip_timestamp) {
|
|
|
|
// In this mode, we expect users to always provide a min timestamp.
|
|
|
|
if (internal_comparator.user_comparator()->CompareTimestamp(
|
|
|
|
begin_key_ts, MinU64Ts()) != 0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"persist_user_defined_timestamps flag is set to false, only "
|
|
|
|
"minimum timestamp is accepted for start key.");
|
|
|
|
}
|
|
|
|
if (internal_comparator.user_comparator()->CompareTimestamp(
|
|
|
|
end_key_ts, MinU64Ts()) != 0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"persist_user_defined_timestamps flag is set to false, only "
|
|
|
|
"minimum timestamp is accepted for end key.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-14 05:40:23 +00:00
|
|
|
RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */);
|
|
|
|
if (file_info.num_range_del_entries == 0) {
|
|
|
|
file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
|
|
|
|
tombstone.start_key_.size());
|
|
|
|
file_info.largest_range_del_key.assign(tombstone.end_key_.data(),
|
|
|
|
tombstone.end_key_.size());
|
|
|
|
} else {
|
|
|
|
if (internal_comparator.user_comparator()->Compare(
|
|
|
|
tombstone.start_key_, file_info.smallest_range_del_key) < 0) {
|
|
|
|
file_info.smallest_range_del_key.assign(tombstone.start_key_.data(),
|
|
|
|
tombstone.start_key_.size());
|
|
|
|
}
|
|
|
|
if (internal_comparator.user_comparator()->Compare(
|
|
|
|
tombstone.end_key_, file_info.largest_range_del_key) > 0) {
|
|
|
|
file_info.largest_range_del_key.assign(tombstone.end_key_.data(),
|
|
|
|
tombstone.end_key_.size());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
auto ikey_and_end_key = tombstone.Serialize();
|
|
|
|
builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
|
|
|
|
|
|
|
|
// update file info
|
|
|
|
file_info.num_range_del_entries++;
|
|
|
|
file_info.file_size = builder->FileSize();
|
|
|
|
|
2021-04-06 18:53:54 +00:00
|
|
|
InvalidatePageCache(false /* closing */).PermitUncheckedError();
|
|
|
|
return Status::OK();
|
2018-07-14 05:40:23 +00:00
|
|
|
}
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
Status DeleteRange(const Slice& begin_key, const Slice& end_key) {
|
|
|
|
if (internal_comparator.user_comparator()->timestamp_size() != 0) {
|
|
|
|
return Status::InvalidArgument("Timestamp size mismatch");
|
|
|
|
}
|
|
|
|
return DeleteRangeImpl(begin_key, end_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
// begin_key and end_key should be users keys without timestamp.
|
|
|
|
Status DeleteRange(const Slice& begin_key, const Slice& end_key,
|
|
|
|
const Slice& timestamp) {
|
|
|
|
const size_t timestamp_size = timestamp.size();
|
|
|
|
|
|
|
|
if (internal_comparator.user_comparator()->timestamp_size() !=
|
|
|
|
timestamp_size) {
|
|
|
|
return Status::InvalidArgument("Timestamp size mismatch");
|
|
|
|
}
|
|
|
|
|
|
|
|
const size_t begin_key_size = begin_key.size();
|
|
|
|
const size_t end_key_size = end_key.size();
|
|
|
|
if (begin_key.data() + begin_key_size == timestamp.data() ||
|
|
|
|
end_key.data() + begin_key_size == timestamp.data()) {
|
|
|
|
assert(memcmp(begin_key.data() + begin_key_size,
|
|
|
|
end_key.data() + end_key_size, timestamp_size) == 0);
|
|
|
|
Slice begin_key_with_ts(begin_key.data(),
|
|
|
|
begin_key_size + timestamp_size);
|
|
|
|
Slice end_key_with_ts(end_key.data(), end_key.size() + timestamp_size);
|
|
|
|
return DeleteRangeImpl(begin_key_with_ts, end_key_with_ts);
|
|
|
|
}
|
|
|
|
std::string begin_key_with_ts;
|
|
|
|
begin_key_with_ts.reserve(begin_key_size + timestamp_size);
|
|
|
|
begin_key_with_ts.append(begin_key.data(), begin_key_size);
|
|
|
|
begin_key_with_ts.append(timestamp.data(), timestamp_size);
|
|
|
|
std::string end_key_with_ts;
|
|
|
|
end_key_with_ts.reserve(end_key_size + timestamp_size);
|
|
|
|
end_key_with_ts.append(end_key.data(), end_key_size);
|
|
|
|
end_key_with_ts.append(timestamp.data(), timestamp_size);
|
|
|
|
return DeleteRangeImpl(begin_key_with_ts, end_key_with_ts);
|
|
|
|
}
|
|
|
|
|
2020-09-28 22:11:54 +00:00
|
|
|
Status InvalidatePageCache(bool closing) {
|
|
|
|
Status s = Status::OK();
|
2017-05-26 19:05:19 +00:00
|
|
|
if (invalidate_page_cache == false) {
|
|
|
|
// Fadvise disabled
|
2020-09-28 22:11:54 +00:00
|
|
|
return s;
|
2017-05-26 19:05:19 +00:00
|
|
|
}
|
2022-10-25 18:50:38 +00:00
|
|
|
uint64_t bytes_since_last_fadvise = builder->FileSize() - last_fadvise_size;
|
2017-05-26 19:05:19 +00:00
|
|
|
if (bytes_since_last_fadvise > kFadviseTrigger || closing) {
|
|
|
|
TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache",
|
|
|
|
&(bytes_since_last_fadvise));
|
2020-02-28 22:10:51 +00:00
|
|
|
// Tell the OS that we don't need this file in page cache
|
2020-09-28 22:11:54 +00:00
|
|
|
s = file_writer->InvalidateCache(0, 0);
|
|
|
|
if (s.IsNotSupported()) {
|
|
|
|
// NotSupported is fine as it could be a file type that doesn't use page
|
|
|
|
// cache.
|
|
|
|
s = Status::OK();
|
|
|
|
}
|
2017-05-26 19:05:19 +00:00
|
|
|
last_fadvise_size = builder->FileSize();
|
|
|
|
}
|
2020-09-28 22:11:54 +00:00
|
|
|
return s;
|
2017-05-26 19:05:19 +00:00
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
SstFileWriter::SstFileWriter(const EnvOptions& env_options,
|
2016-05-17 20:11:56 +00:00
|
|
|
const Options& options,
|
2016-12-05 22:16:23 +00:00
|
|
|
const Comparator* user_comparator,
|
2016-12-29 22:51:52 +00:00
|
|
|
ColumnFamilyHandle* column_family,
|
2017-05-23 18:36:06 +00:00
|
|
|
bool invalidate_page_cache,
|
2018-01-22 22:37:37 +00:00
|
|
|
Env::IOPriority io_priority, bool skip_filters)
|
2017-05-23 18:36:06 +00:00
|
|
|
: rep_(new Rep(env_options, options, io_priority, user_comparator,
|
2021-08-21 03:39:52 +00:00
|
|
|
column_family, invalidate_page_cache, skip_filters,
|
|
|
|
DBImpl::GenerateDbSessionId(options.env))) {
|
|
|
|
// SstFileWriter is used to create sst files that can be added to database
|
|
|
|
// later. Therefore, no real db_id and db_session_id are associated with it.
|
|
|
|
// Here we mimic the way db_session_id behaves by getting a db_session_id
|
|
|
|
// for each SstFileWriter, and (later below) assign unique file numbers
|
|
|
|
// in the table properties. The db_id is set to be "SST Writer" for clarity.
|
|
|
|
|
2016-12-29 22:51:52 +00:00
|
|
|
rep_->file_info.file_size = 0;
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
|
2016-11-13 04:03:39 +00:00
|
|
|
SstFileWriter::~SstFileWriter() {
|
|
|
|
if (rep_->builder) {
|
|
|
|
// User did not call Finish() or Finish() failed, we need to
|
|
|
|
// abandon the builder.
|
|
|
|
rep_->builder->Abandon();
|
|
|
|
}
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
|
Fix/improve temperature handling for file ingestion (#12402)
Summary:
Partly following up on leftovers from https://github.com/facebook/rocksdb/issues/12388
In terms of public API:
* Make it clear that IngestExternalFileArg::file_temperature is just a hint for opening the existing file, though it was previously used for both copy-from temp hint and copy-to temp, which was bizarre.
* Specify how IngestExternalFile assigns temperature to file ingested into DB. (See details in comments.) This approach is not perfect in terms of matching how the DB assigns temperatures, but was the simplest way to get close. The key complication for matching DB temperature assignments is that ingestion files are copied (to a destination temp) before their target level is determined (in general).
* Add a temperature option to SstFileWriter::Open so that files intended for ingestion can be initially written to a chosen temperature.
* Note that "fail_if_not_bottommost_level" is obsolete/confusing use of "bottommost"
In terms of the implementation, there was a similar bit of oddness with the internal CopyFile API, which only took one temperature, ambiguously applicable to the source, destination, or both. This is also fixed.
Eventual suggested follow-up:
* Before copying files for ingestion, determine a tentative level assignment to use for destination temperature, and keep that even if final level assignment happens to be different at commit time (rare).
* More temperature handling for CreateColumnFamilyWithImport and Checkpoints.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12402
Test Plan:
Deeply revamped
ExternalSSTFileBasicTest.IngestWithTemperature to test the new changes. Previously this test was insufficient because it was only looking at temperatures according to the DB manifest. Incorporating FileTemperatureTestFS allows us to also test the temperatures in the storage layer.
Used macros instead of functions for better tracing to critical source location on test failures.
Some enhancements to FileTemperatureTestFS in the process of developing the revamped test.
Reviewed By: jowlyzhang
Differential Revision: D54442794
Pulled By: pdillinger
fbshipit-source-id: 41d9d0afdc073e6a983304c10bbc07c70cc7e995
2024-03-06 00:56:08 +00:00
|
|
|
Status SstFileWriter::Open(const std::string& file_path, Temperature temp) {
|
2017-05-23 18:36:06 +00:00
|
|
|
Rep* r = rep_.get();
|
2015-09-23 19:42:43 +00:00
|
|
|
Status s;
|
2021-01-29 06:08:46 +00:00
|
|
|
std::unique_ptr<FSWritableFile> sst_file;
|
2021-02-11 06:18:33 +00:00
|
|
|
FileOptions cur_file_opts(r->env_options);
|
Fix/improve temperature handling for file ingestion (#12402)
Summary:
Partly following up on leftovers from https://github.com/facebook/rocksdb/issues/12388
In terms of public API:
* Make it clear that IngestExternalFileArg::file_temperature is just a hint for opening the existing file, though it was previously used for both copy-from temp hint and copy-to temp, which was bizarre.
* Specify how IngestExternalFile assigns temperature to file ingested into DB. (See details in comments.) This approach is not perfect in terms of matching how the DB assigns temperatures, but was the simplest way to get close. The key complication for matching DB temperature assignments is that ingestion files are copied (to a destination temp) before their target level is determined (in general).
* Add a temperature option to SstFileWriter::Open so that files intended for ingestion can be initially written to a chosen temperature.
* Note that "fail_if_not_bottommost_level" is obsolete/confusing use of "bottommost"
In terms of the implementation, there was a similar bit of oddness with the internal CopyFile API, which only took one temperature, ambiguously applicable to the source, destination, or both. This is also fixed.
Eventual suggested follow-up:
* Before copying files for ingestion, determine a tentative level assignment to use for destination temperature, and keep that even if final level assignment happens to be different at commit time (rare).
* More temperature handling for CreateColumnFamilyWithImport and Checkpoints.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12402
Test Plan:
Deeply revamped
ExternalSSTFileBasicTest.IngestWithTemperature to test the new changes. Previously this test was insufficient because it was only looking at temperatures according to the DB manifest. Incorporating FileTemperatureTestFS allows us to also test the temperatures in the storage layer.
Used macros instead of functions for better tracing to critical source location on test failures.
Some enhancements to FileTemperatureTestFS in the process of developing the revamped test.
Reviewed By: jowlyzhang
Differential Revision: D54442794
Pulled By: pdillinger
fbshipit-source-id: 41d9d0afdc073e6a983304c10bbc07c70cc7e995
2024-03-06 00:56:08 +00:00
|
|
|
cur_file_opts.temperature = temp;
|
2021-01-29 06:08:46 +00:00
|
|
|
s = r->ioptions.env->GetFileSystem()->NewWritableFile(
|
2021-02-11 06:18:33 +00:00
|
|
|
file_path, cur_file_opts, &sst_file, nullptr);
|
2015-09-23 19:42:43 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2017-05-23 18:36:06 +00:00
|
|
|
sst_file->SetIOPriority(r->io_priority);
|
|
|
|
|
2016-06-20 18:26:25 +00:00
|
|
|
CompressionType compression_type;
|
2018-06-28 00:34:07 +00:00
|
|
|
CompressionOptions compression_opts;
|
2020-03-31 19:08:41 +00:00
|
|
|
if (r->mutable_cf_options.bottommost_compression !=
|
|
|
|
kDisableCompressionOption) {
|
|
|
|
compression_type = r->mutable_cf_options.bottommost_compression;
|
|
|
|
if (r->mutable_cf_options.bottommost_compression_opts.enabled) {
|
|
|
|
compression_opts = r->mutable_cf_options.bottommost_compression_opts;
|
2018-06-28 00:34:07 +00:00
|
|
|
} else {
|
2020-03-31 19:08:41 +00:00
|
|
|
compression_opts = r->mutable_cf_options.compression_opts;
|
2018-06-28 00:34:07 +00:00
|
|
|
}
|
2022-03-08 02:06:19 +00:00
|
|
|
} else if (!r->mutable_cf_options.compression_per_level.empty()) {
|
2015-09-23 19:42:43 +00:00
|
|
|
// Use the compression of the last level if we have per level compression
|
2022-03-08 02:06:19 +00:00
|
|
|
compression_type = *(r->mutable_cf_options.compression_per_level.rbegin());
|
2020-03-31 19:08:41 +00:00
|
|
|
compression_opts = r->mutable_cf_options.compression_opts;
|
2016-06-20 18:26:25 +00:00
|
|
|
} else {
|
|
|
|
compression_type = r->mutable_cf_options.compression;
|
2020-03-31 19:08:41 +00:00
|
|
|
compression_opts = r->mutable_cf_options.compression_opts;
|
2015-09-23 19:42:43 +00:00
|
|
|
}
|
|
|
|
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2016-08-19 23:17:56 +00:00
|
|
|
|
|
|
|
// SstFileWriter properties collector to add SstFileWriter version.
|
2024-02-02 22:14:43 +00:00
|
|
|
internal_tbl_prop_coll_factories.emplace_back(
|
2016-10-18 23:59:37 +00:00
|
|
|
new SstFileWriterPropertiesCollectorFactory(2 /* version */,
|
|
|
|
0 /* global_seqno*/));
|
2015-09-23 19:42:43 +00:00
|
|
|
|
2016-08-19 23:17:56 +00:00
|
|
|
// User collector factories
|
|
|
|
auto user_collector_factories =
|
|
|
|
r->ioptions.table_properties_collector_factories;
|
|
|
|
for (size_t i = 0; i < user_collector_factories.size(); i++) {
|
2024-02-02 22:14:43 +00:00
|
|
|
internal_tbl_prop_coll_factories.emplace_back(
|
2016-08-19 23:17:56 +00:00
|
|
|
new UserKeyTablePropertiesCollectorFactory(
|
|
|
|
user_collector_factories[i]));
|
|
|
|
}
|
2016-09-18 05:30:43 +00:00
|
|
|
int unknown_level = -1;
|
2016-12-05 22:16:23 +00:00
|
|
|
uint32_t cf_id;
|
|
|
|
|
|
|
|
if (r->cfh != nullptr) {
|
|
|
|
// user explicitly specified that this file will be ingested into cfh,
|
|
|
|
// we can persist this information in the file.
|
|
|
|
cf_id = r->cfh->GetID();
|
|
|
|
r->column_family_name = r->cfh->GetName();
|
|
|
|
} else {
|
|
|
|
r->column_family_name = "";
|
|
|
|
cf_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
|
|
|
|
}
|
2022-07-15 04:49:34 +00:00
|
|
|
|
|
|
|
// TODO: it would be better to set oldest_key_time to be used for getting the
|
|
|
|
// approximate time of ingested keys.
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
2015-09-23 19:42:43 +00:00
|
|
|
TableBuilderOptions table_builder_options(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
r->ioptions, r->mutable_cf_options, ReadOptions(), r->write_options,
|
2024-02-02 22:14:43 +00:00
|
|
|
r->internal_comparator, &internal_tbl_prop_coll_factories,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
compression_type, compression_opts, cf_id, r->column_family_name,
|
|
|
|
unknown_level, false /* is_bottommost */, TableFileCreationReason::kMisc,
|
|
|
|
0 /* oldest_key_time */, 0 /* file_creation_time */,
|
|
|
|
"SST Writer" /* db_id */, r->db_session_id, 0 /* target_file_size */,
|
|
|
|
r->next_file_number);
|
2021-08-21 03:39:52 +00:00
|
|
|
// External SST files used to each get a unique session id. Now for
|
|
|
|
// slightly better uniqueness probability in constructing cache keys, we
|
|
|
|
// assign fake file numbers to each file (into table properties) and keep
|
|
|
|
// the same session id for the life of the SstFileWriter.
|
|
|
|
r->next_file_number++;
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 20:49:24 +00:00
|
|
|
// XXX: when we can remove skip_filters from the SstFileWriter public API
|
|
|
|
// we can remove it from TableBuilderOptions.
|
|
|
|
table_builder_options.skip_filters = r->skip_filters;
|
2021-02-11 06:18:33 +00:00
|
|
|
FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types;
|
2020-05-20 18:53:49 +00:00
|
|
|
r->file_writer.reset(new WritableFileWriter(
|
2021-03-15 11:32:24 +00:00
|
|
|
std::move(sst_file), file_path, r->env_options, r->ioptions.clock,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
nullptr /* io_tracer */, r->ioptions.stats, Histograms::SST_WRITE_MICROS,
|
|
|
|
r->ioptions.listeners, r->ioptions.file_checksum_gen_factory.get(),
|
Using existing crc32c checksum in checksum handoff for Manifest and WAL (#8412)
Summary:
In PR https://github.com/facebook/rocksdb/issues/7523 , checksum handoff is introduced in RocksDB for WAL, Manifest, and SST files. When user enable checksum handoff for a certain type of file, before the data is written to the lower layer storage system, we calculate the checksum (crc32c) of each piece of data and pass the checksum down with the data, such that data verification can be down by the lower layer storage system if it has the capability. However, it cannot cover the whole lifetime of the data in the memory and also it potentially introduces extra checksum calculation overhead.
In this PR, we introduce a new interface in WritableFileWriter::Append, which allows the caller be able to pass the data and the checksum (crc32c) together. In this way, WritableFileWriter can directly use the pass-in checksum (crc32c) to generate the checksum of data being passed down to the storage system. It saves the calculation overhead and achieves higher protection coverage. When a new checksum is added with the data, we use Crc32cCombine https://github.com/facebook/rocksdb/issues/8305 to combine the existing checksum and the new checksum. To avoid the segmenting of data by rate-limiter before it is stored, rate-limiter is called enough times to accumulate enough credits for a certain write. This design only support Manifest and WAL which use log_writer in the current stage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8412
Test Plan: make check, add new testing cases.
Reviewed By: anand1976
Differential Revision: D29151545
Pulled By: zhichao-cao
fbshipit-source-id: 75e2278c5126cfd58393c67b1efd18dcc7a30772
2021-06-25 07:46:33 +00:00
|
|
|
tmp_set.Contains(FileType::kTableFile), false));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
// TODO(tec) : If table_factory is using compressed block cache, we will
|
|
|
|
// be adding the external sst file blocks into it, which is wasteful.
|
2015-09-23 19:42:43 +00:00
|
|
|
r->builder.reset(r->ioptions.table_factory->NewTableBuilder(
|
2021-04-29 13:59:53 +00:00
|
|
|
table_builder_options, r->file_writer.get()));
|
2015-09-23 19:42:43 +00:00
|
|
|
|
2018-07-14 05:40:23 +00:00
|
|
|
r->file_info = ExternalSstFileInfo();
|
2015-09-23 19:42:43 +00:00
|
|
|
r->file_info.file_path = file_path;
|
2016-10-18 23:59:37 +00:00
|
|
|
r->file_info.version = 2;
|
2015-09-23 19:42:43 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SstFileWriter::Add(const Slice& user_key, const Slice& value) {
|
2017-05-26 19:05:19 +00:00
|
|
|
return rep_->Add(user_key, value, ValueType::kTypeValue);
|
|
|
|
}
|
2016-12-29 22:51:52 +00:00
|
|
|
|
2017-05-26 19:05:19 +00:00
|
|
|
Status SstFileWriter::Put(const Slice& user_key, const Slice& value) {
|
|
|
|
return rep_->Add(user_key, value, ValueType::kTypeValue);
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
|
2021-09-10 01:57:01 +00:00
|
|
|
Status SstFileWriter::Put(const Slice& user_key, const Slice& timestamp,
|
|
|
|
const Slice& value) {
|
|
|
|
return rep_->Add(user_key, timestamp, value, ValueType::kTypeValue);
|
|
|
|
}
|
|
|
|
|
2023-08-11 01:16:10 +00:00
|
|
|
Status SstFileWriter::PutEntity(const Slice& user_key,
|
|
|
|
const WideColumns& columns) {
|
|
|
|
return rep_->AddEntity(user_key, columns);
|
|
|
|
}
|
|
|
|
|
2017-05-26 19:05:19 +00:00
|
|
|
Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) {
|
|
|
|
return rep_->Add(user_key, value, ValueType::kTypeMerge);
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
|
2017-05-26 19:05:19 +00:00
|
|
|
Status SstFileWriter::Delete(const Slice& user_key) {
|
|
|
|
return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion);
|
2015-09-23 19:42:43 +00:00
|
|
|
}
|
|
|
|
|
2021-09-10 01:57:01 +00:00
|
|
|
Status SstFileWriter::Delete(const Slice& user_key, const Slice& timestamp) {
|
|
|
|
return rep_->Add(user_key, timestamp, Slice(),
|
|
|
|
ValueType::kTypeDeletionWithTimestamp);
|
|
|
|
}
|
|
|
|
|
2018-07-14 05:40:23 +00:00
|
|
|
Status SstFileWriter::DeleteRange(const Slice& begin_key,
|
|
|
|
const Slice& end_key) {
|
|
|
|
return rep_->DeleteRange(begin_key, end_key);
|
|
|
|
}
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
Status SstFileWriter::DeleteRange(const Slice& begin_key, const Slice& end_key,
|
|
|
|
const Slice& timestamp) {
|
|
|
|
return rep_->DeleteRange(begin_key, end_key, timestamp);
|
|
|
|
}
|
|
|
|
|
2015-09-23 19:42:43 +00:00
|
|
|
Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
|
2017-05-23 18:36:06 +00:00
|
|
|
Rep* r = rep_.get();
|
2015-09-23 19:42:43 +00:00
|
|
|
if (!r->builder) {
|
|
|
|
return Status::InvalidArgument("File is not opened");
|
|
|
|
}
|
2018-07-14 05:40:23 +00:00
|
|
|
if (r->file_info.num_entries == 0 &&
|
|
|
|
r->file_info.num_range_del_entries == 0) {
|
2016-01-25 21:47:07 +00:00
|
|
|
return Status::InvalidArgument("Cannot create sst file with no entries");
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
|
|
|
|
Status s = r->builder->Finish();
|
2016-12-29 22:51:52 +00:00
|
|
|
r->file_info.file_size = r->builder->FileSize();
|
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
IOOptions opts;
|
2024-01-25 18:44:00 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
s = WritableFileWriter::PrepareIOOptions(r->write_options, opts);
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
if (s.ok()) {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = r->file_writer->Sync(opts, r->ioptions.use_fsync);
|
2021-04-06 18:53:54 +00:00
|
|
|
r->InvalidatePageCache(true /* closing */).PermitUncheckedError();
|
2015-09-23 19:42:43 +00:00
|
|
|
if (s.ok()) {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = r->file_writer->Close(opts);
|
2015-09-23 19:42:43 +00:00
|
|
|
}
|
|
|
|
}
|
2020-05-20 18:53:49 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
r->file_info.file_checksum = r->file_writer->GetFileChecksum();
|
|
|
|
r->file_info.file_checksum_func_name =
|
|
|
|
r->file_writer->GetFileChecksumFuncName();
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
r->ioptions.env->DeleteFile(r->file_info.file_path);
|
|
|
|
}
|
|
|
|
|
2016-12-29 22:51:52 +00:00
|
|
|
if (file_info != nullptr) {
|
2015-09-23 19:42:43 +00:00
|
|
|
*file_info = r->file_info;
|
2024-02-14 04:30:07 +00:00
|
|
|
Slice smallest_key = r->file_info.smallest_key;
|
|
|
|
Slice largest_key = r->file_info.largest_key;
|
|
|
|
Slice smallest_range_del_key = r->file_info.smallest_range_del_key;
|
|
|
|
Slice largest_range_del_key = r->file_info.largest_range_del_key;
|
|
|
|
assert(smallest_key.empty() == largest_key.empty());
|
|
|
|
assert(smallest_range_del_key.empty() == largest_range_del_key.empty());
|
|
|
|
// Remove user-defined timestamps from external file metadata too when they
|
|
|
|
// should not be persisted.
|
|
|
|
if (r->strip_timestamp) {
|
|
|
|
if (!smallest_key.empty()) {
|
|
|
|
assert(smallest_key.size() >= r->ts_sz);
|
|
|
|
assert(largest_key.size() >= r->ts_sz);
|
|
|
|
file_info->smallest_key.resize(smallest_key.size() - r->ts_sz);
|
|
|
|
file_info->largest_key.resize(largest_key.size() - r->ts_sz);
|
|
|
|
}
|
|
|
|
if (!smallest_range_del_key.empty()) {
|
|
|
|
assert(smallest_range_del_key.size() >= r->ts_sz);
|
|
|
|
assert(largest_range_del_key.size() >= r->ts_sz);
|
|
|
|
file_info->smallest_range_del_key.resize(smallest_range_del_key.size() -
|
|
|
|
r->ts_sz);
|
|
|
|
file_info->largest_range_del_key.resize(largest_range_del_key.size() -
|
|
|
|
r->ts_sz);
|
|
|
|
}
|
|
|
|
}
|
2015-09-23 19:42:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
r->builder.reset();
|
|
|
|
return s;
|
|
|
|
}
|
2016-12-17 02:26:56 +00:00
|
|
|
|
2022-10-25 18:50:38 +00:00
|
|
|
uint64_t SstFileWriter::FileSize() { return rep_->file_info.file_size; }
|
2017-02-28 19:05:08 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|