// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <algorithm>
#include <atomic>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include "db/version_edit.h"
#include "port/port.h"
#include "rocksdb/comparator.h"
#include "rocksdb/io_status.h"
#include "rocksdb/table.h"
#include "table/internal_iterator.h"
#include "table/table_builder.h"
#include "table/table_reader.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/kv_map.h"
#include "util/mutexlock.h"
namespace ROCKSDB_NAMESPACE {
namespace mock {
using KVPair = std::pair<std::string, std::string>;
using KVVector = std::vector<KVPair>;
KVVector MakeMockFile(std::initializer_list<KVPair> l = {});
void SortKVVector(KVVector* kv_vector,
                  const Comparator* ucmp = BytewiseComparator());
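// A minimal usage sketch (illustrative only; the keys shown are plain
// strings, whereas tests typically store encoded internal keys):
//
//   KVVector file = MakeMockFile({{"b", "2"}, {"a", "1"}});
//   SortKVVector(&file);  // sort the pairs, bytewise user comparator by default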
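// Shared in-memory "file system" for mock tables: maps a table id to its
// key-value contents. The mutex guards concurrent access from the table
// builders and readers created by MockTableFactory.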
struct MockTableFileSystem {
port::Mutex mutex;
std::map<uint32_t, KVVector> files;
};
class MockTableFactory : public TableFactory {
public:
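  // Controls whether the mock table builder deliberately corrupts what it
  // writes (used by corruption/repair tests). The exact corruption applied by
  // each mode is defined in the corresponding .cc file; the names suggest
  // tampering with a key, a value, or the ordering of keys.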
enum MockCorruptionMode {
kCorruptNone,
kCorruptKey,
kCorruptValue,
kCorruptReorderKey,
};
MockTableFactory();
static const char* kClassName() { return "MockTable"; }
const char* Name() const override { return kClassName(); }
using TableFactory::NewTableReader;
  Status NewTableReader(
      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      std::unique_ptr<TableReader>* table_reader,
      bool prefetch_index_and_filter_in_cache = true) const override;
  TableBuilder* NewTableBuilder(
      const TableBuilderOptions& table_builder_options,
      WritableFileWriter* file) const override;
  // This function will directly create a mock table instead of going through
  // MockTableBuilder. file_contents must consist of <internal_key, value>
  // pairs; those key-value pairs will then be inserted into the mock table.
  Status CreateMockTable(Env* env, const std::string& fname,
                         KVVector file_contents);
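  // Illustrative sketch (the file name and keys are hypothetical; tests
  // normally build internal keys with their own helpers and use a real table
  // file name):
  //
  //   MockTableFactory factory;
  //   KVVector contents = MakeMockFile({{"ikey1", "v1"}, {"ikey2", "v2"}});
  //   Status s = factory.CreateMockTable(Env::Default(), "/tmp/000007.sst",
  //                                      std::move(contents));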
  std::string GetPrintableOptions() const override {
    return std::string();
  }
void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; }
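  // Presumably lets tests control the per-entry size a mock table reports, so
  // size-based behaviors (e.g. compaction output file cutting, see PR #10655)
  // can be exercised without writing real data.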
void SetKeyValueSize(size_t size) { key_value_size_ = size; }
  // This function will assert that only a single file exists and that the
  // contents are equal to file_contents.
void AssertSingleFile(const KVVector& file_contents);
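  // Presumably asserts that the most recently created files have contents
  // equal to files_contents, compared entry by entry.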
void AssertLatestFiles(const std::vector<KVVector>& files_contents);
private:
Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const;
Status GetIDFromFile(RandomAccessFileReader* file, uint32_t* id) const;
mutable MockTableFileSystem file_system_;
mutable std::atomic<uint32_t> next_id_;
MockCorruptionMode corrupt_mode_;
size_t key_value_size_ = 1;
};
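
// Typical wiring in the internal tests (sketch; the surrounding test setup is
// elided and only the standard RocksDB option field is shown):
//
//   Options options;
//   options.table_factory = std::make_shared<mock::MockTableFactory>();
//   // ... run a flush/compaction job or open a DB against mock tables ...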
} // namespace mock
} // namespace ROCKSDB_NAMESPACE