rocksdb/db/version_edit_handler.h

382 lines
14 KiB
C
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/version_builder.h"
#include "db/version_edit.h"
#include "db/version_set.h"
namespace ROCKSDB_NAMESPACE {
struct FileMetaData;
class VersionEditHandlerBase {
public:
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
explicit VersionEditHandlerBase(const ReadOptions& read_options)
: read_options_(read_options),
max_manifest_read_size_(std::numeric_limits<uint64_t>::max()) {}
virtual ~VersionEditHandlerBase() {}
void Iterate(log::Reader& reader, Status* log_read_status);
const Status& status() const { return status_; }
AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; }
protected:
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
explicit VersionEditHandlerBase(const ReadOptions& read_options,
uint64_t max_read_size)
: read_options_(read_options), max_manifest_read_size_(max_read_size) {}
virtual Status Initialize() { return Status::OK(); }
virtual Status ApplyVersionEdit(VersionEdit& edit,
ColumnFamilyData** cfd) = 0;
Best-effort recovery support for atomic flush (#12406) Summary: This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once. The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits. Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery. The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios. The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary. - `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates. - `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406 Reviewed By: jaykorean, cbi42 Differential Revision: D54494904 Pulled By: ajkr fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
virtual Status OnAtomicGroupReplayBegin() { return Status::OK(); }
virtual Status OnAtomicGroupReplayEnd() { return Status::OK(); }
virtual void CheckIterationResult(const log::Reader& /*reader*/,
Status* /*s*/) {}
void ClearReadBuffer() { read_buffer_.Clear(); }
Status status_;
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
const ReadOptions& read_options_;
private:
AtomicGroupReadBuffer read_buffer_;
const uint64_t max_manifest_read_size_;
};
class ListColumnFamiliesHandler : public VersionEditHandlerBase {
public:
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
explicit ListColumnFamiliesHandler(const ReadOptions& read_options)
: VersionEditHandlerBase(read_options) {}
~ListColumnFamiliesHandler() override {}
const std::map<uint32_t, std::string> GetColumnFamilyNames() const {
return column_family_names_;
}
protected:
Status ApplyVersionEdit(VersionEdit& edit,
ColumnFamilyData** /*unused*/) override;
private:
// default column family is always implicitly there
std::map<uint32_t, std::string> column_family_names_{
{0, kDefaultColumnFamilyName}};
};
class FileChecksumRetriever : public VersionEditHandlerBase {
public:
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size,
FileChecksumList& file_checksum_list)
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
: VersionEditHandlerBase(read_options, max_read_size),
file_checksum_list_(file_checksum_list) {}
~FileChecksumRetriever() override {}
protected:
Status ApplyVersionEdit(VersionEdit& edit,
ColumnFamilyData** /*unused*/) override;
private:
FileChecksumList& file_checksum_list_;
};
using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
// A class used for scanning MANIFEST file.
// VersionEditHandler reads a MANIFEST file, parses the version edits, and
// builds the version set's in-memory state, e.g. the version storage info for
// the versions of column families.
// To use this class and its subclasses,
// 1. Create an object of VersionEditHandler or its subclasses.
// VersionEditHandler handler(read_only, column_families, version_set,
// track_missing_files,
// no_error_if_files_missing);
// 2. Status s = handler.Iterate(reader, &db_id);
// 3. Check s and handle possible errors.
//
// Not thread-safe, external synchronization is necessary if an object of
// VersionEditHandler is shared by multiple threads.
class VersionEditHandler : public VersionEditHandlerBase {
public:
explicit VersionEditHandler(
bool read_only,
const std::vector<ColumnFamilyDescriptor>& column_families,
VersionSet* version_set, bool track_missing_files,
bool no_error_if_files_missing,
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
const std::shared_ptr<IOTracer>& io_tracer,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
const ReadOptions& read_options,
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent)
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
: VersionEditHandler(
read_only, column_families, version_set, track_missing_files,
no_error_if_files_missing, io_tracer, read_options,
/*skip_load_table_files=*/false, epoch_number_requirement) {}
~VersionEditHandler() override {}
const VersionEditParams& GetVersionEditParams() const {
return version_edit_params_;
}
bool HasMissingFiles() const;
void GetDbId(std::string* db_id) const {
if (db_id && version_edit_params_.HasDbId()) {
*db_id = version_edit_params_.GetDbId();
}
}
protected:
explicit VersionEditHandler(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, bool track_missing_files,
bool no_error_if_files_missing,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, bool skip_load_table_files,
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent);
Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd);
Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd);
Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd);
Status OnWalAddition(VersionEdit& edit);
Status OnWalDeletion(VersionEdit& edit);
Status Initialize() override;
void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
bool* cf_in_builders) const;
void CheckIterationResult(const log::Reader& reader, Status* s) override;
ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options,
const VersionEdit& edit);
virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
virtual Status MaybeCreateVersion(const VersionEdit& edit,
ColumnFamilyData* cfd,
bool force_create_version);
virtual Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache,
bool is_initial_load);
virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
const bool read_only_;
std::vector<ColumnFamilyDescriptor> column_families_;
VersionSet* version_set_;
std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
// Keeps track of column families in manifest that were not found in
// column families parameters. if those column families are not dropped
// by subsequent manifest records, Recover() will return failure status.
std::unordered_map<uint32_t, std::string> column_families_not_found_;
VersionEditParams version_edit_params_;
const bool track_missing_files_;
std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
cf_to_missing_files_;
std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
bool no_error_if_files_missing_;
std::shared_ptr<IOTracer> io_tracer_;
bool skip_load_table_files_;
bool initialized_;
std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
EpochNumberRequirement epoch_number_requirement_;
Support switching on / off UDT together with in-Memtable-only feature (#11623) Summary: Add support to allow enabling / disabling user-defined timestamps feature for an existing column family in combination with the in-Memtable only feature. To do this, this PR includes: 1) Log the `persist_user_defined_timestamps` option per column family in Manifest to facilitate detecting an attempt to enable / disable UDT. This entry is enforced to be logged in the same VersionEdit as the user comparator name entry. 2) User-defined timestamps related options are validated when re-opening a column family, including user comparator name and the `persist_user_defined_timestamps` flag. These type of settings and settings change are considered valid: a) no user comparator change and no effective `persist_user_defined_timestamp` flag change. b) switch user comparator to enable UDT provided the immediately effective `persist_user_defined_timestamps` flag is false. c) switch user comparator to disable UDT provided that the before-change `persist_user_defined_timestamps` is already false. 3) when an attempt to enable UDT is detected, we mark all its existing SST files as "having no UDT" by marking its `FileMetaData.user_defined_timestamps_persisted` flag to false and handle their file boundaries `FileMetaData.smallest`, `FileMetaData.largest` by padding a min timestamp. 4) while enabling / disabling UDT feature, timestamp size inconsistency in existing WAL logs are handled to make it compatible with the running user comparator. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11623 Test Plan: ``` make all check ./db_with_timestamp_basic_test --gtest-filter="*EnableDisableUDT*" ./db_wal_test --gtest_filter="*EnableDisableUDT*" ``` Reviewed By: ltamasi Differential Revision: D47636862 Pulled By: jowlyzhang fbshipit-source-id: dcd19f67292da3c3cc9584c09ad00331c9ab9322
2023-07-27 03:16:32 +00:00
std::unordered_set<uint32_t> cfds_to_mark_no_udt_;
private:
Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
const VersionEdit& edit);
// When `FileMetaData.user_defined_timestamps_persisted` is false and
// user-defined timestamp size is non-zero. User-defined timestamps are
// stripped from file boundaries: `smallest`, `largest` in
// `VersionEdit.DecodeFrom` before they were written to Manifest.
// This is the mirroring change to handle file boundaries on the Manifest read
// path for this scenario: to pad a minimum timestamp to the user key in
// `smallest` and `largest` so their format are consistent with the running
// user comparator.
Status MaybeHandleFileBoundariesForNewFiles(VersionEdit& edit,
const ColumnFamilyData* cfd);
};
// A class similar to its base class, i.e. VersionEditHandler.
// VersionEditHandlerPointInTime restores the versions to the most recent point
// in time such that at this point, the version does not have missing files.
//
// Not thread-safe, external synchronization is necessary if an object of
// VersionEditHandlerPointInTime is shared by multiple threads.
class VersionEditHandlerPointInTime : public VersionEditHandler {
public:
VersionEditHandlerPointInTime(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
const ReadOptions& read_options,
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent);
~VersionEditHandlerPointInTime() override;
protected:
Best-effort recovery support for atomic flush (#12406) Summary: This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once. The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits. Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery. The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios. The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary. - `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates. - `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406 Reviewed By: jaykorean, cbi42 Differential Revision: D54494904 Pulled By: ajkr fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
Status OnAtomicGroupReplayBegin() override;
Status OnAtomicGroupReplayEnd() override;
void CheckIterationResult(const log::Reader& reader, Status* s) override;
Best-effort recovery support for atomic flush (#12406) Summary: This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once. The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits. Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery. The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios. The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary. - `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates. - `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406 Reviewed By: jaykorean, cbi42 Differential Revision: D54494904 Pulled By: ajkr fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
Best-effort recovery support for atomic flush (#12406) Summary: This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once. The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits. Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery. The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios. The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary. - `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates. - `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406 Reviewed By: jaykorean, cbi42 Differential Revision: D54494904 Pulled By: ajkr fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
// `MaybeCreateVersion(..., false)` creates a version upon a negative edge
// trigger (transition from valid to invalid).
//
// `MaybeCreateVersion(..., true)` creates a version on a positive level
// trigger (state is valid).
Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
bool force_create_version) override;
virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
int level, const FileMetaData& fmeta);
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition);
Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache,
bool is_initial_load) override;
std::unordered_map<uint32_t, Version*> versions_;
Best-effort recovery support for atomic flush (#12406) Summary: This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once. The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits. Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery. The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios. The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary. - `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates. - `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406 Reviewed By: jaykorean, cbi42 Differential Revision: D54494904 Pulled By: ajkr fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
// `atomic_update_versions_` is for ensuring all-or-nothing AtomicGroup
// recoveries. When `atomic_update_versions_` is nonempty, it serves as a
// barrier to updating `versions_` until all its values are populated.
std::unordered_map<uint32_t, Version*> atomic_update_versions_;
// `atomic_update_versions_missing_` counts the nullptr values in
// `atomic_update_versions_`.
size_t atomic_update_versions_missing_;
bool in_atomic_group_ = false;
private:
bool AtomicUpdateVersionsCompleted();
bool AtomicUpdateVersionsContains(uint32_t cfid);
void AtomicUpdateVersionsDropCf(uint32_t cfid);
// This function is called for `Version*` updates for column families in an
// incomplete atomic update. It buffers `Version*` updates in
// `atomic_update_versions_`.
void AtomicUpdateVersionsPut(Version* version);
// This function is called upon completion of an atomic update. It applies
// `Version*` updates in `atomic_update_versions_` to `versions_`.
void AtomicUpdateVersionsApply();
};
class ManifestTailer : public VersionEditHandlerPointInTime {
public:
explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set,
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
const std::shared_ptr<IOTracer>& io_tracer,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
const ReadOptions& read_options,
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent)
: VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
version_set, io_tracer, read_options,
Sort L0 files by newly introduced epoch_num (#10922) Summary: **Context:** Sorting L0 files by `largest_seqno` has at least two inconvenience: - File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap. - For example, consider the following sequence of events ("key@n" indicates key at seqno "n") - insert k1@1 to memtable m1 - ingest file s1 with k2@2, ingest file s2 with k3@3 - insert k4@4 to m1 - compact files s1, s2 and result in new file s3 of seqno range [2, 3] - flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1 - However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption. - Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption - For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example) - an existing SST s1 contains only k1@1 - insert k1@2 to memtable m1 - ingest file s2 with k3@3, ingest file s3 with k4@4 - insert single delete k5@5 in m1 - flush m1 and result in new file s4 of seqno range [2, 5] - compact s1, s2, s3 and result in new file s5 of seqno range [1, 4] - compact s4 and result in new file s6 of seqno range [2] due to single delete - By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno` Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways: - In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more. - In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption. **Summary:** - Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`. - `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`) - Compaction output file is assigned with the minimum `epoch_number` among input files' - Refit level: reuse refitted file's epoch_number - Other paths needing `epoch_number` treatment: - Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo` - Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`. - Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair). - Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder. - Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery - Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more - Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag` - Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above - Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`. - Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR. - Misc: - update existing tests with `epoch_number` so make check will pass - update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases - assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber() Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922 Test Plan: - `make check` - New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc` - Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930 - [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox` - [Ongoing] normal db stress test - [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761 Reviewed By: ajkr Differential Revision: D41063187 Pulled By: hx235 fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
epoch_number_requirement),
mode_(Mode::kRecovery) {}
void PrepareToReadNewManifest() {
initialized_ = false;
ClearReadBuffer();
}
std::unordered_set<ColumnFamilyData*>& GetUpdatedColumnFamilies() {
return cfds_changed_;
}
protected:
Status Initialize() override;
bool MustOpenAllColumnFamilies() const override { return false; }
Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override;
void CheckIterationResult(const log::Reader& reader, Status* s) override;
Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
const FileMetaData& fmeta) override;
enum Mode : uint8_t {
kRecovery = 0,
kCatchUp = 1,
};
Mode mode_;
std::unordered_set<ColumnFamilyData*> cfds_changed_;
};
class DumpManifestHandler : public VersionEditHandler {
public:
DumpManifestHandler(std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, bool verbose, bool hex,
bool json)
: VersionEditHandler(
/*read_only=*/true, column_families, version_set,
/*track_missing_files=*/false,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
/*no_error_if_files_missing=*/false, io_tracer, read_options,
/*skip_load_table_files=*/true),
verbose_(verbose),
hex_(hex),
json_(json),
count_(0) {
cf_to_cmp_names_.reset(new std::unordered_map<uint32_t, std::string>());
}
~DumpManifestHandler() override {}
Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override {
// Write out each individual edit
if (verbose_ && !json_) {
// Print out DebugStrings. Can include non-terminating null characters.
fwrite(edit.DebugString(hex_).data(), sizeof(char),
edit.DebugString(hex_).size(), stdout);
} else if (json_) {
// Print out DebugStrings. Can include non-terminating null characters.
fwrite(edit.DebugString(hex_).data(), sizeof(char),
edit.DebugString(hex_).size(), stdout);
}
++count_;
return VersionEditHandler::ApplyVersionEdit(edit, cfd);
}
void CheckIterationResult(const log::Reader& reader, Status* s) override;
private:
const bool verbose_;
const bool hex_;
const bool json_;
int count_;
};
} // namespace ROCKSDB_NAMESPACE