2020-06-25 02:30:15 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
|
|
|
|
#include "table/sst_file_dumper.h"
|
|
|
|
|
|
|
|
#include <chrono>
|
|
|
|
#include <cinttypes>
|
|
|
|
#include <iostream>
|
|
|
|
#include <map>
|
|
|
|
#include <memory>
|
|
|
|
#include <sstream>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/blob/blob_index.h"
|
|
|
|
#include "db/memtable.h"
|
Wide Column support in ldb (#11754)
Summary:
wide_columns can now be pretty-printed in the following commands
- `./ldb dump_wal`
- `./ldb dump`
- `./ldb idump`
- `./ldb dump_live_files`
- `./ldb scan`
- `./sst_dump --command=scan`
There are opportunities to refactor to reduce some nearly identical code. This PR is initial change to add wide column support in `ldb` and `sst_dump` tool. More PRs to come for the refactor.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11754
Test Plan:
**New Tests added**
- `WideColumnsHelperTest::DumpWideColumns`
- `WideColumnsHelperTest::DumpSliceAsWideColumns`
**Changes added to existing tests**
- `ExternalSSTFileTest::BasicMixed` added to cover mixed case (This test should have been added in https://github.com/facebook/rocksdb/issues/11688). This test does not verify the ldb or sst_dump output. This test was used to create test SST files having some rows with wide columns and some without and the generated SST files were used to manually test sst_dump_tool.
- `createSST()` in `sst_dump_test` now takes `wide_column_one_in` to add wide column value in SST
**dump_wal**
```
./ldb dump_wal --walfile=/tmp/rocksdbtest-226125/db_wide_basic_test_2675429_2308393776696827948/000004.log --print_value --header
```
```
Sequence,Count,ByteSize,Physical Offset,Key(s) : value
1,1,59,0,PUT_ENTITY(0) : 0x:0x68656C6C6F 0x617474725F6E616D6531:0x666F6F 0x617474725F6E616D6532:0x626172
2,1,34,42,PUT_ENTITY(0) : 0x617474725F6F6E65:0x74776F 0x617474725F7468726565:0x666F7572
3,1,17,7d,PUT(0) : 0x7468697264 : 0x62617A
```
**idump**
```
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ idump
```
```
'first' seq:1, type:22 => :hello attr_name1:foo attr_name2:bar
'second' seq:2, type:22 => attr_one:two attr_three:four
'third' seq:3, type:1 => baz
Internal keys in range: 3
```
**SST Dump from dump_live_files**
```
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ compact
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ dump_live_files
```
```
...
==============================
SST Files
==============================
/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/000013.sst level:1
------------------------------
Process /tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/000013.sst
Sst file format: block-based
'first' seq:0, type:22 => :hello attr_name1:foo attr_name2:bar
'second' seq:0, type:22 => attr_one:two attr_three:four
'third' seq:0, type:1 => baz
...
```
**dump**
```
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ dump
```
```
first ==> :hello attr_name1:foo attr_name2:bar
second ==> attr_one:two attr_three:four
third ==> baz
Keys in range: 3
```
**scan**
```
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ scan
```
```
first : :hello attr_name1:foo attr_name2:bar
second : attr_one:two attr_three:four
third : baz
```
**sst_dump**
```
./sst_dump --file=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/000013.sst --command=scan
```
```
options.env is 0x7ff54b296000
Process /tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/000013.sst
Sst file format: block-based
from [] to []
'first' seq:0, type:22 => :hello attr_name1:foo attr_name2:bar
'second' seq:0, type:22 => attr_one:two attr_three:four
'third' seq:0, type:1 => baz
```
Reviewed By: ltamasi
Differential Revision: D48837999
Pulled By: jaykorean
fbshipit-source-id: b0280f0589d2b9716bb9b50530ffcabb397d140f
2023-08-30 19:45:52 +00:00
|
|
|
#include "db/wide/wide_column_serialization.h"
|
|
|
|
#include "db/wide/wide_columns_helper.h"
|
2020-06-25 02:30:15 +00:00
|
|
|
#include "db/write_batch_internal.h"
|
|
|
|
#include "options/cf_options.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/iterator.h"
|
|
|
|
#include "rocksdb/slice_transform.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/table_properties.h"
|
|
|
|
#include "rocksdb/utilities/ldb_cmd.h"
|
|
|
|
#include "table/block_based/block.h"
|
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
|
|
|
#include "table/block_based/block_builder.h"
|
|
|
|
#include "table/format.h"
|
|
|
|
#include "table/meta_blocks.h"
|
|
|
|
#include "table/plain/plain_table_factory.h"
|
|
|
|
#include "table/table_reader.h"
|
|
|
|
#include "util/compression.h"
|
|
|
|
#include "util/random.h"
|
2023-08-30 20:42:04 +00:00
|
|
|
#include "util/udt_util.h"
|
2020-06-25 02:30:15 +00:00
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
SstFileDumper::SstFileDumper(const Options& options,
|
|
|
|
const std::string& file_path,
|
New backup meta schema, with file temperatures (#9660)
Summary:
The primary goal of this change is to add support for backing up and
restoring (applying on restore) file temperature metadata, without
committing to either the DB manifest or the FS reported "current"
temperatures being exclusive "source of truth".
To achieve this goal, we need to add temperature information to backup
metadata, which requires updated backup meta schema. Fortunately I
prepared for this in https://github.com/facebook/rocksdb/issues/8069, which began forward compatibility in version
6.19.0 for this kind of schema update. (Previously, backup meta schema
was not extensible! Making this schema update public will allow some
other "nice to have" features like taking backups with hard links, and
avoiding crc32c checksum computation when another checksum is already
available.) While schema version 2 is newly public, the default schema
version is still 1. Until we change the default, users will need to set
to 2 to enable features like temperature data backup+restore. New
metadata like temperature information will be ignored with a warning
in versions before this change and since 6.19.0. The metadata is
considered ignorable because a functioning DB can be restored without
it.
Some detail:
* Some renaming because "future schema" is now just public schema 2.
* Initialize some atomics in TestFs (linter reported)
* Add temperature hint support to SstFileDumper (used by BackupEngine)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9660
Test Plan:
related unit test majorly updated for the new functionality,
including some shared testing support for tracking temperatures in a FS.
Some other tests and testing hooks into production code also updated for
making the backup meta schema change public.
Reviewed By: ajkr
Differential Revision: D34686968
Pulled By: pdillinger
fbshipit-source-id: 3ac1fa3e67ee97ca8a5103d79cc87d872c1d862a
2022-03-18 18:06:17 +00:00
|
|
|
Temperature file_temp, size_t readahead_size,
|
|
|
|
bool verify_checksum, bool output_hex,
|
|
|
|
bool decode_blob_index, const EnvOptions& soptions,
|
|
|
|
bool silent)
|
2020-06-25 02:30:15 +00:00
|
|
|
: file_name_(file_path),
|
|
|
|
read_num_(0),
|
New backup meta schema, with file temperatures (#9660)
Summary:
The primary goal of this change is to add support for backing up and
restoring (applying on restore) file temperature metadata, without
committing to either the DB manifest or the FS reported "current"
temperatures being exclusive "source of truth".
To achieve this goal, we need to add temperature information to backup
metadata, which requires updated backup meta schema. Fortunately I
prepared for this in https://github.com/facebook/rocksdb/issues/8069, which began forward compatibility in version
6.19.0 for this kind of schema update. (Previously, backup meta schema
was not extensible! Making this schema update public will allow some
other "nice to have" features like taking backups with hard links, and
avoiding crc32c checksum computation when another checksum is already
available.) While schema version 2 is newly public, the default schema
version is still 1. Until we change the default, users will need to set
to 2 to enable features like temperature data backup+restore. New
metadata like temperature information will be ignored with a warning
in versions before this change and since 6.19.0. The metadata is
considered ignorable because a functioning DB can be restored without
it.
Some detail:
* Some renaming because "future schema" is now just public schema 2.
* Initialize some atomics in TestFs (linter reported)
* Add temperature hint support to SstFileDumper (used by BackupEngine)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9660
Test Plan:
related unit test majorly updated for the new functionality,
including some shared testing support for tracking temperatures in a FS.
Some other tests and testing hooks into production code also updated for
making the backup meta schema change public.
Reviewed By: ajkr
Differential Revision: D34686968
Pulled By: pdillinger
fbshipit-source-id: 3ac1fa3e67ee97ca8a5103d79cc87d872c1d862a
2022-03-18 18:06:17 +00:00
|
|
|
file_temp_(file_temp),
|
2020-06-25 02:30:15 +00:00
|
|
|
output_hex_(output_hex),
|
|
|
|
decode_blob_index_(decode_blob_index),
|
2020-07-09 15:36:41 +00:00
|
|
|
soptions_(soptions),
|
2020-06-25 02:30:15 +00:00
|
|
|
silent_(silent),
|
|
|
|
options_(options),
|
|
|
|
ioptions_(options_),
|
|
|
|
moptions_(ColumnFamilyOptions(options_)),
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
2020-06-25 02:30:15 +00:00
|
|
|
read_options_(verify_checksum, false),
|
|
|
|
internal_comparator_(BytewiseComparator()) {
|
|
|
|
read_options_.readahead_size = readahead_size;
|
|
|
|
if (!silent_) {
|
|
|
|
fprintf(stdout, "Process %s\n", file_path.c_str());
|
|
|
|
}
|
|
|
|
init_result_ = GetTableReader(file_name_);
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* testFileName = "test_file_name";
|
|
|
|
|
|
|
|
Status SstFileDumper::GetTableReader(const std::string& file_path) {
|
|
|
|
// Warning about 'magic_number' being uninitialized shows up only in UBsan
|
|
|
|
// builds. Though access is guarded by 's.ok()' checks, fix the issue to
|
|
|
|
// avoid any warnings.
|
2021-12-10 16:12:09 +00:00
|
|
|
uint64_t magic_number = Footer::kNullTableMagicNumber;
|
2020-06-25 02:30:15 +00:00
|
|
|
|
|
|
|
// read table magic number
|
|
|
|
Footer footer;
|
|
|
|
|
2021-01-29 06:08:46 +00:00
|
|
|
const auto& fs = options_.env->GetFileSystem();
|
|
|
|
std::unique_ptr<FSRandomAccessFile> file;
|
2020-06-25 02:30:15 +00:00
|
|
|
uint64_t file_size = 0;
|
New backup meta schema, with file temperatures (#9660)
Summary:
The primary goal of this change is to add support for backing up and
restoring (applying on restore) file temperature metadata, without
committing to either the DB manifest or the FS reported "current"
temperatures being exclusive "source of truth".
To achieve this goal, we need to add temperature information to backup
metadata, which requires updated backup meta schema. Fortunately I
prepared for this in https://github.com/facebook/rocksdb/issues/8069, which began forward compatibility in version
6.19.0 for this kind of schema update. (Previously, backup meta schema
was not extensible! Making this schema update public will allow some
other "nice to have" features like taking backups with hard links, and
avoiding crc32c checksum computation when another checksum is already
available.) While schema version 2 is newly public, the default schema
version is still 1. Until we change the default, users will need to set
to 2 to enable features like temperature data backup+restore. New
metadata like temperature information will be ignored with a warning
in versions before this change and since 6.19.0. The metadata is
considered ignorable because a functioning DB can be restored without
it.
Some detail:
* Some renaming because "future schema" is now just public schema 2.
* Initialize some atomics in TestFs (linter reported)
* Add temperature hint support to SstFileDumper (used by BackupEngine)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9660
Test Plan:
related unit test majorly updated for the new functionality,
including some shared testing support for tracking temperatures in a FS.
Some other tests and testing hooks into production code also updated for
making the backup meta schema change public.
Reviewed By: ajkr
Differential Revision: D34686968
Pulled By: pdillinger
fbshipit-source-id: 3ac1fa3e67ee97ca8a5103d79cc87d872c1d862a
2022-03-18 18:06:17 +00:00
|
|
|
FileOptions fopts = soptions_;
|
|
|
|
fopts.temperature = file_temp_;
|
|
|
|
Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
|
2020-06-25 02:30:15 +00:00
|
|
|
if (s.ok()) {
|
2024-01-05 17:48:19 +00:00
|
|
|
// check empty file
|
|
|
|
// if true, skip further processing of this file
|
2021-01-29 06:08:46 +00:00
|
|
|
s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
|
2024-01-05 17:48:19 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
if (file_size == 0) {
|
|
|
|
return Status::Aborted(file_path, "Empty file");
|
|
|
|
}
|
|
|
|
}
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
|
|
|
|
2021-01-29 06:08:46 +00:00
|
|
|
file_.reset(new RandomAccessFileReader(std::move(file), file_path));
|
2020-06-25 02:30:15 +00:00
|
|
|
|
2024-01-05 17:29:01 +00:00
|
|
|
FilePrefetchBuffer prefetch_buffer(ReadaheadParams(),
|
|
|
|
!fopts.use_mmap_reads /* enable */,
|
|
|
|
false /* track_min_offset */);
|
2020-06-25 02:30:15 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
const uint64_t kSstDumpTailPrefetchSize = 512 * 1024;
|
|
|
|
uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize)
|
|
|
|
? kSstDumpTailPrefetchSize
|
|
|
|
: file_size;
|
|
|
|
uint64_t prefetch_off = file_size - prefetch_size;
|
2020-06-29 21:51:57 +00:00
|
|
|
IOOptions opts;
|
2020-09-05 02:25:20 +00:00
|
|
|
s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off,
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
static_cast<size_t>(prefetch_size));
|
2020-06-25 02:30:15 +00:00
|
|
|
|
2022-12-09 18:03:47 +00:00
|
|
|
s = ReadFooterFromFile(opts, file_.get(), *fs, &prefetch_buffer, file_size,
|
2020-06-29 21:51:57 +00:00
|
|
|
&footer);
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
magic_number = footer.table_magic_number();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
if (magic_number == kPlainTableMagicNumber ||
|
2023-11-30 16:06:37 +00:00
|
|
|
magic_number == kLegacyPlainTableMagicNumber ||
|
|
|
|
magic_number == kCuckooTableMagicNumber) {
|
2020-06-25 02:30:15 +00:00
|
|
|
soptions_.use_mmap_reads = true;
|
Fix a bug in sst_dump when parsing PlainTable (#12223)
Summary:
### Summary: The sst_dump tool occur IO Error when reading data in PlainTable, as shown in the follow
```bash
❯ ./sst_dump --file=/tmp/write_example --command=scan --show_properties --verify_checksum
options.env is 0x60000282dc00
Process /tmp/write_example/001630.sst
Sst file format: plain table
/tmp/filepicker_example/001630.sst: IO error: While pread offset 0 len 758: /tmp/filepicker_example/001630.sst: Bad address
Process /tmp/filepicker_example/001624.sst
```
#### Reason
The root cause is that `fopts.use_mmap_reads` is false, `NewRandomAccessFile` will produce an `PosixRandomAccessFile` file. but `soptions_.use_mmap_reads` is true, This will result in unexpected calls in the `MmapDataIfNeeded` function.
```c++
Status SstFileDumper::GetTableReader(const std::string& file_path) {
...
if (s.ok()) {
if (magic_number == kPlainTableMagicNumber ||
magic_number == kLegacyPlainTableMagicNumber ||
magic_number == kCuckooTableMagicNumber) {
soptions_.use_mmap_reads = true;
...
// WARN: fopts.use_mmap_reads is false
fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
file_.reset(new RandomAccessFileReader(std::move(file), file_path));
}
...
}
if (s.ok()) {
// soptions_.use_mmap_reads is true
s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size,
&table_reader_);
}
return s;
}
```
The following read logic was executed on a `PosixRandomAccessFile` file, Eventually, `PosixRandomAccessFile::Read` will be called with a `nullptr` `scratch`
```c++
Status PlainTableReader::MmapDataIfNeeded() {
if (file_info_.is_mmap_mode) {
// Get mmapped memory.
// Executing the following logic on the PosixRandomAccessFile file is incorrect
return file_info_.file->Read(
IOOptions(), 0, static_cast<size_t>(file_size_), &file_info_.file_data,
nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
}
return Status::OK();
}
```
#### Fix:
When parsing PlainTable, set the variable `fopts.use_mmap_reads` equal `soptions_.use_mmap_reads`, When the `soptions_.use_mmap_reads` is true, `NewRandomAccessFile` will produce an `PosixMmapReadableFile` file. This will work correctly in the `MmapDataIfNeeded` function
```
❯ ./sst_dump --file=/tmp/write_example --command=scan --show_properties --verify_checksum
options.env is 0x6000009323e0
Process /tmp/write_example/001630.sst
Sst file format: plain table
from [] to []
'keys496' seq:0, type:1 => values1496
'keys497' seq:0, type:1 => values1497
'keys498' seq:0, type:1 => values1498
Table Properties:
------------------------------
# data blocks: 1
# entries: 3
# deletions: 0
# merge operands: 0
# range deletions: 0
raw key size: 45
raw average key size: 15.000000
raw value size: 42
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12223
Reviewed By: cbi42
Differential Revision: D52706238
Pulled By: ajkr
fbshipit-source-id: 2f9f518ec81d1cbde00bd65ab6bd304796836c0a
2024-01-12 22:56:10 +00:00
|
|
|
fopts.use_mmap_reads = soptions_.use_mmap_reads;
|
2021-01-29 06:08:46 +00:00
|
|
|
|
2023-11-30 16:06:37 +00:00
|
|
|
if (magic_number == kCuckooTableMagicNumber) {
|
|
|
|
fopts = soptions_;
|
|
|
|
fopts.temperature = file_temp_;
|
|
|
|
}
|
|
|
|
|
New backup meta schema, with file temperatures (#9660)
Summary:
The primary goal of this change is to add support for backing up and
restoring (applying on restore) file temperature metadata, without
committing to either the DB manifest or the FS reported "current"
temperatures being exclusive "source of truth".
To achieve this goal, we need to add temperature information to backup
metadata, which requires updated backup meta schema. Fortunately I
prepared for this in https://github.com/facebook/rocksdb/issues/8069, which began forward compatibility in version
6.19.0 for this kind of schema update. (Previously, backup meta schema
was not extensible! Making this schema update public will allow some
other "nice to have" features like taking backups with hard links, and
avoiding crc32c checksum computation when another checksum is already
available.) While schema version 2 is newly public, the default schema
version is still 1. Until we change the default, users will need to set
to 2 to enable features like temperature data backup+restore. New
metadata like temperature information will be ignored with a warning
in versions before this change and since 6.19.0. The metadata is
considered ignorable because a functioning DB can be restored without
it.
Some detail:
* Some renaming because "future schema" is now just public schema 2.
* Initialize some atomics in TestFs (linter reported)
* Add temperature hint support to SstFileDumper (used by BackupEngine)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9660
Test Plan:
related unit test majorly updated for the new functionality,
including some shared testing support for tracking temperatures in a FS.
Some other tests and testing hooks into production code also updated for
making the backup meta schema change public.
Reviewed By: ajkr
Differential Revision: D34686968
Pulled By: pdillinger
fbshipit-source-id: 3ac1fa3e67ee97ca8a5103d79cc87d872c1d862a
2022-03-18 18:06:17 +00:00
|
|
|
fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
|
2021-01-29 06:08:46 +00:00
|
|
|
file_.reset(new RandomAccessFileReader(std::move(file), file_path));
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
2022-02-08 20:14:25 +00:00
|
|
|
|
2020-06-25 02:30:15 +00:00
|
|
|
// For old sst format, ReadTableProperties might fail but file can be read
|
|
|
|
if (ReadTableProperties(magic_number, file_.get(), file_size,
|
|
|
|
(magic_number == kBlockBasedTableMagicNumber)
|
|
|
|
? &prefetch_buffer
|
|
|
|
: nullptr)
|
|
|
|
.ok()) {
|
2020-09-05 02:25:20 +00:00
|
|
|
s = SetTableOptionsByMagicNumber(magic_number);
|
2022-02-08 20:14:25 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
if (table_properties_ && !table_properties_->comparator_name.empty()) {
|
|
|
|
ConfigOptions config_options;
|
|
|
|
const Comparator* user_comparator = nullptr;
|
|
|
|
s = Comparator::CreateFromString(config_options,
|
|
|
|
table_properties_->comparator_name,
|
|
|
|
&user_comparator);
|
|
|
|
if (s.ok()) {
|
|
|
|
assert(user_comparator);
|
2022-07-12 20:30:35 +00:00
|
|
|
internal_comparator_ = InternalKeyComparator(user_comparator);
|
2022-02-08 20:14:25 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-06-25 02:30:15 +00:00
|
|
|
} else {
|
2020-09-05 02:25:20 +00:00
|
|
|
s = SetOldTableOptions();
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
2022-02-08 20:14:25 +00:00
|
|
|
options_.comparator = internal_comparator_.user_comparator();
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size,
|
|
|
|
&table_reader_);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SstFileDumper::NewTableReader(
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/,
|
2020-06-25 02:30:15 +00:00
|
|
|
const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
|
|
|
|
std::unique_ptr<TableReader>* /*table_reader*/) {
|
2023-04-25 19:08:23 +00:00
|
|
|
auto t_opt = TableReaderOptions(
|
|
|
|
ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_,
|
|
|
|
0 /* block_protection_bytes_per_key */, false /* skip_filters */,
|
2023-06-22 04:49:01 +00:00
|
|
|
false /* immortal */, true /* force_direct_prefetch */, -1 /* level */,
|
|
|
|
nullptr /* block_cache_tracer */, 0 /* max_file_size_for_l0_meta_pin */,
|
|
|
|
"" /* cur_db_session_id */, 0 /* cur_file_num */, {} /* unique_id */,
|
|
|
|
0 /* largest_seqno */, 0 /* tail_size */,
|
2023-06-22 19:36:22 +00:00
|
|
|
table_properties_ == nullptr
|
|
|
|
? true
|
|
|
|
: static_cast<bool>(
|
|
|
|
table_properties_->user_defined_timestamps_persisted));
|
2020-06-25 02:30:15 +00:00
|
|
|
// Allow open file with global sequence number for backward compatibility.
|
|
|
|
t_opt.largest_seqno = kMaxSequenceNumber;
|
|
|
|
|
|
|
|
// We need to turn off pre-fetching of index and filter nodes for
|
|
|
|
// BlockBasedTable
|
2020-09-14 23:59:00 +00:00
|
|
|
if (options_.table_factory->IsInstanceOf(
|
|
|
|
TableFactory::kBlockBasedTableName())) {
|
2020-06-25 02:30:15 +00:00
|
|
|
return options_.table_factory->NewTableReader(t_opt, std::move(file_),
|
|
|
|
file_size, &table_reader_,
|
|
|
|
/*enable_prefetch=*/false);
|
|
|
|
}
|
|
|
|
|
|
|
|
// For all other factory implementation
|
|
|
|
return options_.table_factory->NewTableReader(t_opt, std::move(file_),
|
|
|
|
file_size, &table_reader_);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SstFileDumper::VerifyChecksum() {
|
2023-07-05 21:12:06 +00:00
|
|
|
assert(read_options_.verify_checksums);
|
2020-06-25 02:30:15 +00:00
|
|
|
// We could pass specific readahead setting into read options if needed.
|
|
|
|
return table_reader_->VerifyChecksum(read_options_,
|
|
|
|
TableReaderCaller::kSSTDumpTool);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SstFileDumper::DumpTable(const std::string& out_filename) {
|
|
|
|
std::unique_ptr<WritableFile> out_file;
|
|
|
|
Env* env = options_.env;
|
2020-12-24 00:54:05 +00:00
|
|
|
Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
|
|
|
|
if (s.ok()) {
|
|
|
|
s = table_reader_->DumpTable(out_file.get());
|
|
|
|
}
|
2020-09-05 02:25:20 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
// close the file before return error, ignore the close error if there's any
|
|
|
|
out_file->Close().PermitUncheckedError();
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
return out_file->Close();
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
|
|
|
|
2020-09-05 02:25:20 +00:00
|
|
|
Status SstFileDumper::CalculateCompressedTableSize(
|
2020-06-25 02:30:15 +00:00
|
|
|
const TableBuilderOptions& tb_options, size_t block_size,
|
2020-09-05 02:25:20 +00:00
|
|
|
uint64_t* num_data_blocks, uint64_t* compressed_table_size) {
|
2020-06-25 02:30:15 +00:00
|
|
|
std::unique_ptr<Env> env(NewMemEnv(options_.env));
|
2021-01-29 06:08:46 +00:00
|
|
|
std::unique_ptr<WritableFileWriter> dest_writer;
|
|
|
|
Status s =
|
|
|
|
WritableFileWriter::Create(env->GetFileSystem(), testFileName,
|
|
|
|
FileOptions(soptions_), &dest_writer, nullptr);
|
2020-09-05 02:25:20 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2020-06-25 02:30:15 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = block_size;
|
|
|
|
BlockBasedTableFactory block_based_tf(table_options);
|
|
|
|
std::unique_ptr<TableBuilder> table_builder;
|
2022-10-25 18:50:38 +00:00
|
|
|
table_builder.reset(
|
|
|
|
block_based_tf.NewTableBuilder(tb_options, dest_writer.get()));
|
2020-06-25 02:30:15 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
|
|
|
|
read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
table_builder->Add(iter->key(), iter->value());
|
|
|
|
}
|
2020-09-05 02:25:20 +00:00
|
|
|
s = iter->status();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
2020-09-05 02:25:20 +00:00
|
|
|
s = table_builder->Finish();
|
2020-06-25 02:30:15 +00:00
|
|
|
if (!s.ok()) {
|
2020-09-05 02:25:20 +00:00
|
|
|
return s;
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
2020-09-05 02:25:20 +00:00
|
|
|
*compressed_table_size = table_builder->FileSize();
|
2020-06-25 02:30:15 +00:00
|
|
|
assert(num_data_blocks != nullptr);
|
|
|
|
*num_data_blocks = table_builder->GetTableProperties().num_data_blocks;
|
2020-09-05 02:25:20 +00:00
|
|
|
return env->DeleteFile(testFileName);
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
|
|
|
|
2020-09-05 02:25:20 +00:00
|
|
|
Status SstFileDumper::ShowAllCompressionSizes(
|
2020-06-25 02:30:15 +00:00
|
|
|
size_t block_size,
|
|
|
|
const std::vector<std::pair<CompressionType, const char*>>&
|
|
|
|
compression_types,
|
2020-09-03 22:48:29 +00:00
|
|
|
int32_t compress_level_from, int32_t compress_level_to,
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 22:06:59 +00:00
|
|
|
uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
|
Support using ZDICT_finalizeDictionary to generate zstd dictionary (#9857)
Summary:
An untrained dictionary is currently simply the concatenation of several samples. The ZSTD API, ZDICT_finalizeDictionary(), can improve such a dictionary's effectiveness at low cost. This PR changes how dictionary is created by calling the ZSTD ZDICT_finalizeDictionary() API instead of creating raw content dictionary (when max_dict_buffer_bytes > 0), and pass in all buffered uncompressed data blocks as samples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9857
Test Plan:
#### db_bench test for cpu/memory of compression+decompression and space saving on synthetic data:
Set up: change the parameter [here](https://github.com/facebook/rocksdb/blob/fb9a167a55e0970b1ef6f67c1600c8d9c4c6114f/tools/db_bench_tool.cc#L1766) to 16384 to make synthetic data more compressible.
```
# linked local ZSTD with version 1.5.2
# DEBUG_LEVEL=0 ROCKSDB_NO_FBCODE=1 ROCKSDB_DISABLE_ZSTD=1 EXTRA_CXXFLAGS="-DZSTD_STATIC_LINKING_ONLY -DZSTD -I/data/users/changyubi/install/include/" EXTRA_LDFLAGS="-L/data/users/changyubi/install/lib/ -l:libzstd.a" make -j32 db_bench
dict_bytes=16384
train_bytes=1048576
echo "========== No Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== Raw Content Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench_main -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench_main -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== FinalizeDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== TrainDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
# Result: TrainDictionary is much better on space saving, but FinalizeDictionary seems to use less memory.
# before compression data size: 1.2GB
dict_bytes=16384
max_dict_buffer_bytes = 1048576
space cpu/memory
No Dictionary 468M 14.93user 1.00system 0:15.92elapsed 100%CPU (0avgtext+0avgdata 23904maxresident)k
Raw Dictionary 251M 15.81user 0.80system 0:16.56elapsed 100%CPU (0avgtext+0avgdata 156808maxresident)k
FinalizeDictionary 236M 11.93user 0.64system 0:12.56elapsed 100%CPU (0avgtext+0avgdata 89548maxresident)k
TrainDictionary 84M 7.29user 0.45system 0:07.75elapsed 100%CPU (0avgtext+0avgdata 97288maxresident)k
```
#### Benchmark on 10 sample SST files for spacing saving and CPU time on compression:
FinalizeDictionary is comparable to TrainDictionary in terms of space saving, and takes less time in compression.
```
dict_bytes=16384
train_bytes=1048576
for sst_file in `ls ../temp/myrock-sst/`
do
echo "********** $sst_file **********"
echo "========== No Dictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD
echo "========== Raw Content Dictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes
echo "========== FinalizeDictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes --compression_use_zstd_finalize_dict
echo "========== TrainDictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes
done
010240.sst (Size/Time) 011029.sst 013184.sst 021552.sst 185054.sst 185137.sst 191666.sst 7560381.sst 7604174.sst 7635312.sst
No Dictionary 28165569 / 2614419 32899411 / 2976832 32977848 / 3055542 31966329 / 2004590 33614351 / 1755877 33429029 / 1717042 33611933 / 1776936 33634045 / 2771417 33789721 / 2205414 33592194 / 388254
Raw Content Dictionary 28019950 / 2697961 33748665 / 3572422 33896373 / 3534701 26418431 / 2259658 28560825 / 1839168 28455030 / 1846039 28494319 / 1861349 32391599 / 3095649 33772142 / 2407843 33592230 / 474523
FinalizeDictionary 27896012 / 2650029 33763886 / 3719427 33904283 / 3552793 26008225 / 2198033 28111872 / 1869530 28014374 / 1789771 28047706 / 1848300 32296254 / 3204027 33698698 / 2381468 33592344 / 517433
TrainDictionary 28046089 / 2740037 33706480 / 3679019 33885741 / 3629351 25087123 / 2204558 27194353 / 1970207 27234229 / 1896811 27166710 / 1903119 32011041 / 3322315 32730692 / 2406146 33608631 / 570593
```
#### Decompression/Read test:
With FinalizeDictionary/TrainDictionary, some data structure used for decompression are in stored in dictionary, so they are expected to be faster in terms of decompression/reads.
```
dict_bytes=16384
train_bytes=1048576
echo "No Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=0 > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=0 2>&1 | grep MB/s
echo "Raw Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes 2>&1 | grep MB/s
echo "FinalizeDict"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false 2>&1 | grep MB/s
echo "Train Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes 2>&1 | grep MB/s
No Dictionary
readrandom : 12.183 micros/op 82082 ops/sec 12.183 seconds 1000000 operations; 9.1 MB/s (1000000 of 1000000 found)
Raw Dictionary
readrandom : 12.314 micros/op 81205 ops/sec 12.314 seconds 1000000 operations; 9.0 MB/s (1000000 of 1000000 found)
FinalizeDict
readrandom : 9.787 micros/op 102180 ops/sec 9.787 seconds 1000000 operations; 11.3 MB/s (1000000 of 1000000 found)
Train Dictionary
readrandom : 9.698 micros/op 103108 ops/sec 9.699 seconds 1000000 operations; 11.4 MB/s (1000000 of 1000000 found)
```
Reviewed By: ajkr
Differential Revision: D35720026
Pulled By: cbi42
fbshipit-source-id: 24d230fdff0fd28a1bb650658798f00dfcfb2a1f
2022-05-20 19:09:09 +00:00
|
|
|
uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) {
|
2020-06-25 02:30:15 +00:00
|
|
|
fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
|
|
|
|
for (auto& i : compression_types) {
|
|
|
|
if (CompressionTypeSupported(i.first)) {
|
|
|
|
fprintf(stdout, "Compression: %-24s\n", i.second);
|
|
|
|
CompressionOptions compress_opt;
|
2020-09-03 22:48:29 +00:00
|
|
|
compress_opt.max_dict_bytes = max_dict_bytes;
|
|
|
|
compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how much data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). It is not strict as we currently buffer more than just data blocks -- also keys are buffered. But it does make a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 22:06:59 +00:00
|
|
|
compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes;
|
Support using ZDICT_finalizeDictionary to generate zstd dictionary (#9857)
Summary:
An untrained dictionary is currently simply the concatenation of several samples. The ZSTD API, ZDICT_finalizeDictionary(), can improve such a dictionary's effectiveness at low cost. This PR changes how dictionary is created by calling the ZSTD ZDICT_finalizeDictionary() API instead of creating raw content dictionary (when max_dict_buffer_bytes > 0), and pass in all buffered uncompressed data blocks as samples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9857
Test Plan:
#### db_bench test for cpu/memory of compression+decompression and space saving on synthetic data:
Set up: change the parameter [here](https://github.com/facebook/rocksdb/blob/fb9a167a55e0970b1ef6f67c1600c8d9c4c6114f/tools/db_bench_tool.cc#L1766) to 16384 to make synthetic data more compressible.
```
# linked local ZSTD with version 1.5.2
# DEBUG_LEVEL=0 ROCKSDB_NO_FBCODE=1 ROCKSDB_DISABLE_ZSTD=1 EXTRA_CXXFLAGS="-DZSTD_STATIC_LINKING_ONLY -DZSTD -I/data/users/changyubi/install/include/" EXTRA_LDFLAGS="-L/data/users/changyubi/install/lib/ -l:libzstd.a" make -j32 db_bench
dict_bytes=16384
train_bytes=1048576
echo "========== No Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== Raw Content Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench_main -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench_main -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== FinalizeDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== TrainDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
# Result: TrainDictionary is much better on space saving, but FinalizeDictionary seems to use less memory.
# before compression data size: 1.2GB
dict_bytes=16384
max_dict_buffer_bytes = 1048576
space cpu/memory
No Dictionary 468M 14.93user 1.00system 0:15.92elapsed 100%CPU (0avgtext+0avgdata 23904maxresident)k
Raw Dictionary 251M 15.81user 0.80system 0:16.56elapsed 100%CPU (0avgtext+0avgdata 156808maxresident)k
FinalizeDictionary 236M 11.93user 0.64system 0:12.56elapsed 100%CPU (0avgtext+0avgdata 89548maxresident)k
TrainDictionary 84M 7.29user 0.45system 0:07.75elapsed 100%CPU (0avgtext+0avgdata 97288maxresident)k
```
#### Benchmark on 10 sample SST files for spacing saving and CPU time on compression:
FinalizeDictionary is comparable to TrainDictionary in terms of space saving, and takes less time in compression.
```
dict_bytes=16384
train_bytes=1048576
for sst_file in `ls ../temp/myrock-sst/`
do
echo "********** $sst_file **********"
echo "========== No Dictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD
echo "========== Raw Content Dictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes
echo "========== FinalizeDictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes --compression_use_zstd_finalize_dict
echo "========== TrainDictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes
done
010240.sst (Size/Time) 011029.sst 013184.sst 021552.sst 185054.sst 185137.sst 191666.sst 7560381.sst 7604174.sst 7635312.sst
No Dictionary 28165569 / 2614419 32899411 / 2976832 32977848 / 3055542 31966329 / 2004590 33614351 / 1755877 33429029 / 1717042 33611933 / 1776936 33634045 / 2771417 33789721 / 2205414 33592194 / 388254
Raw Content Dictionary 28019950 / 2697961 33748665 / 3572422 33896373 / 3534701 26418431 / 2259658 28560825 / 1839168 28455030 / 1846039 28494319 / 1861349 32391599 / 3095649 33772142 / 2407843 33592230 / 474523
FinalizeDictionary 27896012 / 2650029 33763886 / 3719427 33904283 / 3552793 26008225 / 2198033 28111872 / 1869530 28014374 / 1789771 28047706 / 1848300 32296254 / 3204027 33698698 / 2381468 33592344 / 517433
TrainDictionary 28046089 / 2740037 33706480 / 3679019 33885741 / 3629351 25087123 / 2204558 27194353 / 1970207 27234229 / 1896811 27166710 / 1903119 32011041 / 3322315 32730692 / 2406146 33608631 / 570593
```
#### Decompression/Read test:
With FinalizeDictionary/TrainDictionary, some data structure used for decompression are in stored in dictionary, so they are expected to be faster in terms of decompression/reads.
```
dict_bytes=16384
train_bytes=1048576
echo "No Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=0 > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=0 2>&1 | grep MB/s
echo "Raw Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes 2>&1 | grep MB/s
echo "FinalizeDict"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false 2>&1 | grep MB/s
echo "Train Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes 2>&1 | grep MB/s
No Dictionary
readrandom : 12.183 micros/op 82082 ops/sec 12.183 seconds 1000000 operations; 9.1 MB/s (1000000 of 1000000 found)
Raw Dictionary
readrandom : 12.314 micros/op 81205 ops/sec 12.314 seconds 1000000 operations; 9.0 MB/s (1000000 of 1000000 found)
FinalizeDict
readrandom : 9.787 micros/op 102180 ops/sec 9.787 seconds 1000000 operations; 11.3 MB/s (1000000 of 1000000 found)
Train Dictionary
readrandom : 9.698 micros/op 103108 ops/sec 9.699 seconds 1000000 operations; 11.4 MB/s (1000000 of 1000000 found)
```
Reviewed By: ajkr
Differential Revision: D35720026
Pulled By: cbi42
fbshipit-source-id: 24d230fdff0fd28a1bb650658798f00dfcfb2a1f
2022-05-20 19:09:09 +00:00
|
|
|
compress_opt.use_zstd_dict_trainer = use_zstd_dict_trainer;
|
2020-06-25 02:30:15 +00:00
|
|
|
for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
|
|
|
|
fprintf(stdout, "Compression level: %d", j);
|
|
|
|
compress_opt.level = j;
|
2020-09-05 02:25:20 +00:00
|
|
|
Status s = ShowCompressionSize(block_size, i.first, compress_opt);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
fprintf(stdout, "Unsupported compression type: %s.\n", i.second);
|
|
|
|
}
|
|
|
|
}
|
2020-09-05 02:25:20 +00:00
|
|
|
return Status::OK();
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
|
|
|
|
2020-09-05 02:25:20 +00:00
|
|
|
Status SstFileDumper::ShowCompressionSize(
|
|
|
|
size_t block_size, CompressionType compress_type,
|
|
|
|
const CompressionOptions& compress_opt) {
|
2020-06-25 02:30:15 +00:00
|
|
|
Options opts;
|
|
|
|
opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
opts.statistics->set_stats_level(StatsLevel::kAll);
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions imoptions(opts);
|
2020-06-25 02:30:15 +00:00
|
|
|
const ColumnFamilyOptions cfo(opts);
|
|
|
|
const MutableCFOptions moptions(cfo);
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2020-06-25 02:30:15 +00:00
|
|
|
ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories block_based_table_factories;
|
2020-06-25 02:30:15 +00:00
|
|
|
|
|
|
|
std::string column_family_name;
|
|
|
|
int unknown_level = -1;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
|
2021-04-29 13:59:53 +00:00
|
|
|
TableBuilderOptions tb_opts(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
imoptions, moptions, read_options, write_options, ikc,
|
|
|
|
&block_based_table_factories, compress_type, compress_opt,
|
2021-04-29 13:59:53 +00:00
|
|
|
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
|
|
|
|
column_family_name, unknown_level);
|
2020-06-25 02:30:15 +00:00
|
|
|
uint64_t num_data_blocks = 0;
|
|
|
|
std::chrono::steady_clock::time_point start =
|
|
|
|
std::chrono::steady_clock::now();
|
2020-09-05 02:25:20 +00:00
|
|
|
uint64_t file_size;
|
|
|
|
Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks,
|
|
|
|
&file_size);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-06-25 02:30:15 +00:00
|
|
|
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
|
|
|
|
fprintf(stdout, " Size: %10" PRIu64, file_size);
|
|
|
|
fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks);
|
|
|
|
fprintf(stdout, " Time Taken: %10s microsecs",
|
|
|
|
std::to_string(
|
|
|
|
std::chrono::duration_cast<std::chrono::microseconds>(end - start)
|
|
|
|
.count())
|
|
|
|
.c_str());
|
|
|
|
const uint64_t compressed_blocks =
|
|
|
|
opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED);
|
|
|
|
const uint64_t not_compressed_blocks =
|
2023-04-22 04:57:40 +00:00
|
|
|
opts.statistics->getAndResetTickerCount(
|
|
|
|
NUMBER_BLOCK_COMPRESSION_REJECTED);
|
2020-06-25 02:30:15 +00:00
|
|
|
// When the option enable_index_compression is true,
|
|
|
|
// NUMBER_BLOCK_COMPRESSED is incremented for index block(s).
|
|
|
|
if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) {
|
|
|
|
num_data_blocks = compressed_blocks + not_compressed_blocks;
|
|
|
|
}
|
|
|
|
|
|
|
|
const uint64_t ratio_not_compressed_blocks =
|
|
|
|
(num_data_blocks - compressed_blocks) - not_compressed_blocks;
|
|
|
|
const double compressed_pcnt =
|
|
|
|
(0 == num_data_blocks) ? 0.0
|
|
|
|
: ((static_cast<double>(compressed_blocks) /
|
|
|
|
static_cast<double>(num_data_blocks)) *
|
|
|
|
100.0);
|
|
|
|
const double ratio_not_compressed_pcnt =
|
|
|
|
(0 == num_data_blocks)
|
|
|
|
? 0.0
|
|
|
|
: ((static_cast<double>(ratio_not_compressed_blocks) /
|
|
|
|
static_cast<double>(num_data_blocks)) *
|
|
|
|
100.0);
|
|
|
|
const double not_compressed_pcnt =
|
|
|
|
(0 == num_data_blocks) ? 0.0
|
|
|
|
: ((static_cast<double>(not_compressed_blocks) /
|
|
|
|
static_cast<double>(num_data_blocks)) *
|
|
|
|
100.0);
|
|
|
|
fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
|
|
|
|
compressed_pcnt);
|
|
|
|
fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
|
|
|
|
ratio_not_compressed_blocks, ratio_not_compressed_pcnt);
|
|
|
|
fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n",
|
|
|
|
not_compressed_blocks, not_compressed_pcnt);
|
2020-09-05 02:25:20 +00:00
|
|
|
return Status::OK();
|
2020-06-25 02:30:15 +00:00
|
|
|
}
|
|
|
|
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
// Reads TableProperties prior to opening table reader in order to set up
|
|
|
|
// options.
|
2020-06-25 02:30:15 +00:00
|
|
|
Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number,
|
|
|
|
RandomAccessFileReader* file,
|
|
|
|
uint64_t file_size,
|
|
|
|
FilePrefetchBuffer* prefetch_buffer) {
|
|
|
|
Status s = ROCKSDB_NAMESPACE::ReadTableProperties(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
file, file_size, table_magic_number, ioptions_, read_options_,
|
2023-04-21 16:07:18 +00:00
|
|
|
&table_properties_,
|
2020-06-25 02:30:15 +00:00
|
|
|
/* memory_allocator= */ nullptr, prefetch_buffer);
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
if (!s.ok()) {
|
2020-06-25 02:30:15 +00:00
|
|
|
if (!silent_) {
|
|
|
|
fprintf(stdout, "Not able to read table properties\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SstFileDumper::SetTableOptionsByMagicNumber(
|
|
|
|
uint64_t table_magic_number) {
|
|
|
|
assert(table_properties_);
|
|
|
|
if (table_magic_number == kBlockBasedTableMagicNumber ||
|
|
|
|
table_magic_number == kLegacyBlockBasedTableMagicNumber) {
|
|
|
|
BlockBasedTableFactory* bbtf = new BlockBasedTableFactory();
|
|
|
|
// To force tail prefetching, we fake reporting two useful reads of 512KB
|
|
|
|
// from the tail.
|
|
|
|
// It needs at least two data points to warm up the stats.
|
|
|
|
bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
|
|
|
|
bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
|
|
|
|
|
|
|
|
options_.table_factory.reset(bbtf);
|
|
|
|
if (!silent_) {
|
|
|
|
fprintf(stdout, "Sst file format: block-based\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
auto& props = table_properties_->user_collected_properties;
|
|
|
|
auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
|
|
|
|
if (pos != props.end()) {
|
|
|
|
auto index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
|
|
|
|
DecodeFixed32(pos->second.c_str()));
|
|
|
|
if (index_type_on_file ==
|
|
|
|
BlockBasedTableOptions::IndexType::kHashSearch) {
|
|
|
|
options_.prefix_extractor.reset(NewNoopTransform());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (table_magic_number == kPlainTableMagicNumber ||
|
|
|
|
table_magic_number == kLegacyPlainTableMagicNumber) {
|
|
|
|
options_.allow_mmap_reads = true;
|
|
|
|
|
|
|
|
PlainTableOptions plain_table_options;
|
|
|
|
plain_table_options.user_key_len = kPlainTableVariableLength;
|
|
|
|
plain_table_options.bloom_bits_per_key = 0;
|
|
|
|
plain_table_options.hash_table_ratio = 0;
|
|
|
|
plain_table_options.index_sparseness = 1;
|
|
|
|
plain_table_options.huge_page_tlb_size = 0;
|
|
|
|
plain_table_options.encoding_type = kPlain;
|
|
|
|
plain_table_options.full_scan_mode = true;
|
|
|
|
|
|
|
|
options_.table_factory.reset(NewPlainTableFactory(plain_table_options));
|
|
|
|
if (!silent_) {
|
|
|
|
fprintf(stdout, "Sst file format: plain table\n");
|
|
|
|
}
|
2023-11-30 16:06:37 +00:00
|
|
|
} else if (table_magic_number == kCuckooTableMagicNumber) {
|
|
|
|
ioptions_.allow_mmap_reads = true;
|
|
|
|
|
|
|
|
options_.table_factory.reset(NewCuckooTableFactory());
|
|
|
|
if (!silent_) {
|
|
|
|
fprintf(stdout, "Sst file format: cuckoo table\n");
|
|
|
|
}
|
2020-06-25 02:30:15 +00:00
|
|
|
} else {
|
|
|
|
char error_msg_buffer[80];
|
|
|
|
snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1,
|
|
|
|
"Unsupported table magic number --- %lx",
|
|
|
|
(long)table_magic_number);
|
|
|
|
return Status::InvalidArgument(error_msg_buffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SstFileDumper::SetOldTableOptions() {
|
|
|
|
assert(table_properties_ == nullptr);
|
|
|
|
options_.table_factory = std::make_shared<BlockBasedTableFactory>();
|
|
|
|
if (!silent_) {
|
|
|
|
fprintf(stdout, "Sst file format: block-based(old version)\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2024-02-01 22:35:03 +00:00
|
|
|
Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num_limit,
|
2020-06-25 02:30:15 +00:00
|
|
|
bool has_from, const std::string& from_key,
|
|
|
|
bool has_to, const std::string& to_key,
|
|
|
|
bool use_from_as_prefix) {
|
|
|
|
if (!table_reader_) {
|
|
|
|
return init_result_;
|
|
|
|
}
|
|
|
|
|
|
|
|
InternalIterator* iter = table_reader_->NewIterator(
|
|
|
|
read_options_, moptions_.prefix_extractor.get(),
|
|
|
|
/*arena=*/nullptr, /*skip_filters=*/false,
|
|
|
|
TableReaderCaller::kSSTDumpTool);
|
2023-08-30 20:42:04 +00:00
|
|
|
|
|
|
|
const Comparator* ucmp = internal_comparator_.user_comparator();
|
|
|
|
size_t ts_sz = ucmp->timestamp_size();
|
|
|
|
|
|
|
|
Slice from_slice = from_key;
|
|
|
|
Slice to_slice = to_key;
|
|
|
|
std::string from_key_buf, to_key_buf;
|
|
|
|
auto [from, to] = MaybeAddTimestampsToRange(
|
|
|
|
has_from ? &from_slice : nullptr, has_to ? &to_slice : nullptr, ts_sz,
|
|
|
|
&from_key_buf, &to_key_buf);
|
2020-06-25 02:30:15 +00:00
|
|
|
uint64_t i = 0;
|
2023-08-30 20:42:04 +00:00
|
|
|
if (from.has_value()) {
|
2020-06-25 02:30:15 +00:00
|
|
|
InternalKey ikey;
|
2023-08-30 20:42:04 +00:00
|
|
|
ikey.SetMinPossibleForUserKey(from.value());
|
2020-06-25 02:30:15 +00:00
|
|
|
iter->Seek(ikey.Encode());
|
|
|
|
} else {
|
|
|
|
iter->SeekToFirst();
|
|
|
|
}
|
|
|
|
for (; iter->Valid(); iter->Next()) {
|
|
|
|
Slice key = iter->key();
|
|
|
|
Slice value = iter->value();
|
|
|
|
++i;
|
2024-02-01 22:35:03 +00:00
|
|
|
if (read_num_limit > 0 && i > read_num_limit) {
|
2023-12-01 19:15:17 +00:00
|
|
|
break;
|
|
|
|
}
|
2020-06-25 02:30:15 +00:00
|
|
|
|
|
|
|
ParsedInternalKey ikey;
|
2020-10-28 17:11:13 +00:00
|
|
|
Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */);
|
|
|
|
if (!pik_status.ok()) {
|
|
|
|
std::cerr << pik_status.getState() << "\n";
|
2020-06-25 02:30:15 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// the key returned is not prefixed with out 'from' key
|
|
|
|
if (use_from_as_prefix && !ikey.user_key.starts_with(from_key)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If end marker was specified, we stop before it
|
2023-08-30 20:42:04 +00:00
|
|
|
if (to.has_value() && ucmp->Compare(ikey.user_key, to.value()) >= 0) {
|
2020-06-25 02:30:15 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (print_kv) {
|
|
|
|
if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) {
|
Wide Column support in ldb (#11754)
Summary:
wide_columns can now be pretty-printed in the following commands
- `./ldb dump_wal`
- `./ldb dump`
- `./ldb idump`
- `./ldb dump_live_files`
- `./ldb scan`
- `./sst_dump --command=scan`
There are opportunities to refactor to reduce some nearly identical code. This PR is initial change to add wide column support in `ldb` and `sst_dump` tool. More PRs to come for the refactor.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11754
Test Plan:
**New Tests added**
- `WideColumnsHelperTest::DumpWideColumns`
- `WideColumnsHelperTest::DumpSliceAsWideColumns`
**Changes added to existing tests**
- `ExternalSSTFileTest::BasicMixed` added to cover mixed case (This test should have been added in https://github.com/facebook/rocksdb/issues/11688). This test does not verify the ldb or sst_dump output. This test was used to create test SST files having some rows with wide columns and some without and the generated SST files were used to manually test sst_dump_tool.
- `createSST()` in `sst_dump_test` now takes `wide_column_one_in` to add wide column value in SST
**dump_wal**
```
./ldb dump_wal --walfile=/tmp/rocksdbtest-226125/db_wide_basic_test_2675429_2308393776696827948/000004.log --print_value --header
```
```
Sequence,Count,ByteSize,Physical Offset,Key(s) : value
1,1,59,0,PUT_ENTITY(0) : 0x:0x68656C6C6F 0x617474725F6E616D6531:0x666F6F 0x617474725F6E616D6532:0x626172
2,1,34,42,PUT_ENTITY(0) : 0x617474725F6F6E65:0x74776F 0x617474725F7468726565:0x666F7572
3,1,17,7d,PUT(0) : 0x7468697264 : 0x62617A
```
**idump**
```
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ idump
```
```
'first' seq:1, type:22 => :hello attr_name1:foo attr_name2:bar
'second' seq:2, type:22 => attr_one:two attr_three:four
'third' seq:3, type:1 => baz
Internal keys in range: 3
```
**SST Dump from dump_live_files**
```
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ compact
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ dump_live_files
```
```
...
==============================
SST Files
==============================
/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/000013.sst level:1
------------------------------
Process /tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/000013.sst
Sst file format: block-based
'first' seq:0, type:22 => :hello attr_name1:foo attr_name2:bar
'second' seq:0, type:22 => attr_one:two attr_three:four
'third' seq:0, type:1 => baz
...
```
**dump**
```
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ dump
```
```
first ==> :hello attr_name1:foo attr_name2:bar
second ==> attr_one:two attr_three:four
third ==> baz
Keys in range: 3
```
**scan**
```
./ldb --db=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/ scan
```
```
first : :hello attr_name1:foo attr_name2:bar
second : attr_one:two attr_three:four
third : baz
```
**sst_dump**
```
./sst_dump --file=/tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/000013.sst --command=scan
```
```
options.env is 0x7ff54b296000
Process /tmp/rocksdbtest-226125/db_wide_basic_test_3481961_2308393776696827948/000013.sst
Sst file format: block-based
from [] to []
'first' seq:0, type:22 => :hello attr_name1:foo attr_name2:bar
'second' seq:0, type:22 => attr_one:two attr_three:four
'third' seq:0, type:1 => baz
```
Reviewed By: ltamasi
Differential Revision: D48837999
Pulled By: jaykorean
fbshipit-source-id: b0280f0589d2b9716bb9b50530ffcabb397d140f
2023-08-30 19:45:52 +00:00
|
|
|
if (ikey.type == kTypeWideColumnEntity) {
|
|
|
|
std::ostringstream oss;
|
|
|
|
const Status s = WideColumnsHelper::DumpSliceAsWideColumns(
|
|
|
|
iter->value(), oss, output_hex_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "%s => error deserializing wide columns\n",
|
|
|
|
ikey.DebugString(true, output_hex_).c_str());
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
fprintf(stdout, "%s => %s\n",
|
|
|
|
ikey.DebugString(true, output_hex_).c_str(),
|
|
|
|
oss.str().c_str());
|
|
|
|
} else {
|
|
|
|
fprintf(stdout, "%s => %s\n",
|
|
|
|
ikey.DebugString(true, output_hex_).c_str(),
|
|
|
|
value.ToString(output_hex_).c_str());
|
|
|
|
}
|
2020-06-25 02:30:15 +00:00
|
|
|
} else {
|
|
|
|
BlobIndex blob_index;
|
|
|
|
|
|
|
|
const Status s = blob_index.DecodeFrom(value);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "%s => error decoding blob index\n",
|
2020-10-28 17:11:13 +00:00
|
|
|
ikey.DebugString(true, output_hex_).c_str());
|
2020-06-25 02:30:15 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-10-28 17:11:13 +00:00
|
|
|
fprintf(stdout, "%s => %s\n",
|
|
|
|
ikey.DebugString(true, output_hex_).c_str(),
|
2020-06-25 02:30:15 +00:00
|
|
|
blob_index.DebugString(output_hex_).c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
read_num_ += i;
|
|
|
|
|
|
|
|
Status ret = iter->status();
|
2024-02-01 22:35:03 +00:00
|
|
|
|
|
|
|
bool verify_num_entries =
|
|
|
|
(read_num_limit == 0 ||
|
|
|
|
read_num_limit == std::numeric_limits<uint64_t>::max()) &&
|
|
|
|
!has_from && !has_to;
|
|
|
|
if (verify_num_entries && ret.ok()) {
|
|
|
|
// Compare the number of entries
|
|
|
|
if (!table_properties_) {
|
|
|
|
fprintf(stderr, "Table properties not available.");
|
|
|
|
} else {
|
|
|
|
// TODO: verify num_range_deletions
|
|
|
|
if (i != table_properties_->num_entries -
|
|
|
|
table_properties_->num_range_deletions) {
|
2024-03-12 18:05:20 +00:00
|
|
|
std::ostringstream oss;
|
|
|
|
oss << "Table property expects "
|
|
|
|
<< table_properties_->num_entries -
|
|
|
|
table_properties_->num_range_deletions
|
|
|
|
<< " entries when excluding range deletions,"
|
|
|
|
<< " but scanning the table returned " << std::to_string(i)
|
|
|
|
<< " entries";
|
|
|
|
ret = Status::Corruption(oss.str());
|
2024-02-01 22:35:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-25 02:30:15 +00:00
|
|
|
delete iter;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
// Provides TableProperties to API user
|
2020-06-25 02:30:15 +00:00
|
|
|
Status SstFileDumper::ReadTableProperties(
|
|
|
|
std::shared_ptr<const TableProperties>* table_properties) {
|
|
|
|
if (!table_reader_) {
|
|
|
|
return init_result_;
|
|
|
|
}
|
|
|
|
|
|
|
|
*table_properties = table_reader_->GetTableProperties();
|
|
|
|
return init_result_;
|
|
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|