mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 20:43:57 +00:00
f3cc66632b
Summary: Try to align the compaction output file boundaries to the next level ones (grandparent level), to reduce the level compaction write-amplification. In level compaction, there are "wasted" data at the beginning and end of the output level files. Align the file boundary can avoid such "wasted" compaction. With this PR, it tries to align the non-bottommost level file boundaries to its next level ones. It may cut file when the file size is large enough (at least 50% of target_file_size) and not too large (2x target_file_size). db_bench shows about 12.56% compaction reduction: ``` TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432 # baseline: Flush(GB): cumulative 25.882, interval 7.216 Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds # with this change: Flush(GB): cumulative 25.882, interval 7.753 Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds ``` The compaction simulator shows a similar result (14% with 100G random data). As a side effect, with this PR, the SST file size can exceed the target_file_size, but is capped at 2x target_file_size. And there will be smaller files. Here are file size statistics when loading 100GB with the target file size 32MB: ``` baseline this_PR count 1.656000e+03 1.705000e+03 mean 3.116062e+07 3.028076e+07 std 7.145242e+06 8.046139e+06 ``` The feature is enabled by default, to revert to the old behavior disable it with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false` Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for use case like user adding 2 or more non-overlapping data range at the same time, it can reduce the overlapping of 2 datasets in the lower levels. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655 Reviewed By: cbi42 Differential Revision: D39552321 Pulled By: jay-zhuang fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
345 lines
11 KiB
C++
345 lines
11 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "table/mock_table.h"
|
|
|
|
#include "db/dbformat.h"
|
|
#include "env/composite_env_wrapper.h"
|
|
#include "file/random_access_file_reader.h"
|
|
#include "port/port.h"
|
|
#include "rocksdb/table_properties.h"
|
|
#include "table/get_context.h"
|
|
#include "util/coding.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
namespace mock {
|
|
|
|
KVVector MakeMockFile(std::initializer_list<KVPair> l) { return KVVector(l); }
|
|
|
|
void SortKVVector(KVVector* kv_vector, const Comparator* ucmp) {
|
|
InternalKeyComparator icmp(ucmp);
|
|
std::sort(kv_vector->begin(), kv_vector->end(),
|
|
[icmp](KVPair a, KVPair b) -> bool {
|
|
return icmp.Compare(a.first, b.first) < 0;
|
|
});
|
|
}
|
|
|
|
class MockTableReader : public TableReader {
|
|
public:
|
|
explicit MockTableReader(const KVVector& table) : table_(table) {}
|
|
|
|
InternalIterator* NewIterator(const ReadOptions&,
|
|
const SliceTransform* prefix_extractor,
|
|
Arena* arena, bool skip_filters,
|
|
TableReaderCaller caller,
|
|
size_t compaction_readahead_size = 0,
|
|
bool allow_unprepared_value = false) override;
|
|
|
|
Status Get(const ReadOptions& readOptions, const Slice& key,
|
|
GetContext* get_context, const SliceTransform* prefix_extractor,
|
|
bool skip_filters = false) override;
|
|
|
|
uint64_t ApproximateOffsetOf(const Slice& /*key*/,
|
|
TableReaderCaller /*caller*/) override {
|
|
return 0;
|
|
}
|
|
|
|
uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/,
|
|
TableReaderCaller /*caller*/) override {
|
|
return 0;
|
|
}
|
|
|
|
size_t ApproximateMemoryUsage() const override { return 0; }
|
|
|
|
void SetupForCompaction() override {}
|
|
|
|
std::shared_ptr<const TableProperties> GetTableProperties() const override;
|
|
|
|
~MockTableReader() {}
|
|
|
|
private:
|
|
const KVVector& table_;
|
|
};
|
|
|
|
class MockTableIterator : public InternalIterator {
|
|
public:
|
|
explicit MockTableIterator(const KVVector& table) : table_(table) {
|
|
itr_ = table_.end();
|
|
}
|
|
|
|
bool Valid() const override { return itr_ != table_.end(); }
|
|
|
|
void SeekToFirst() override { itr_ = table_.begin(); }
|
|
|
|
void SeekToLast() override {
|
|
itr_ = table_.end();
|
|
--itr_;
|
|
}
|
|
|
|
void Seek(const Slice& target) override {
|
|
KVPair target_pair(target.ToString(), "");
|
|
InternalKeyComparator icmp(BytewiseComparator());
|
|
itr_ = std::lower_bound(table_.begin(), table_.end(), target_pair,
|
|
[icmp](KVPair a, KVPair b) -> bool {
|
|
return icmp.Compare(a.first, b.first) < 0;
|
|
});
|
|
}
|
|
|
|
void SeekForPrev(const Slice& target) override {
|
|
KVPair target_pair(target.ToString(), "");
|
|
InternalKeyComparator icmp(BytewiseComparator());
|
|
itr_ = std::upper_bound(table_.begin(), table_.end(), target_pair,
|
|
[icmp](KVPair a, KVPair b) -> bool {
|
|
return icmp.Compare(a.first, b.first) < 0;
|
|
});
|
|
Prev();
|
|
}
|
|
|
|
void Next() override { ++itr_; }
|
|
|
|
void Prev() override {
|
|
if (itr_ == table_.begin()) {
|
|
itr_ = table_.end();
|
|
} else {
|
|
--itr_;
|
|
}
|
|
}
|
|
|
|
Slice key() const override { return Slice(itr_->first); }
|
|
|
|
Slice value() const override { return Slice(itr_->second); }
|
|
|
|
Status status() const override { return Status::OK(); }
|
|
|
|
private:
|
|
const KVVector& table_;
|
|
KVVector::const_iterator itr_;
|
|
};
|
|
|
|
class MockTableBuilder : public TableBuilder {
|
|
public:
|
|
MockTableBuilder(uint32_t id, MockTableFileSystem* file_system,
|
|
MockTableFactory::MockCorruptionMode corrupt_mode =
|
|
MockTableFactory::kCorruptNone,
|
|
size_t key_value_size = 1)
|
|
: id_(id),
|
|
file_system_(file_system),
|
|
corrupt_mode_(corrupt_mode),
|
|
key_value_size_(key_value_size) {
|
|
table_ = MakeMockFile({});
|
|
}
|
|
|
|
// REQUIRES: Either Finish() or Abandon() has been called.
|
|
~MockTableBuilder() {}
|
|
|
|
// Add key,value to the table being constructed.
|
|
// REQUIRES: key is after any previously added key according to comparator.
|
|
// REQUIRES: Finish(), Abandon() have not been called
|
|
void Add(const Slice& key, const Slice& value) override {
|
|
if (corrupt_mode_ == MockTableFactory::kCorruptValue) {
|
|
// Corrupt the value
|
|
table_.push_back({key.ToString(), value.ToString() + " "});
|
|
corrupt_mode_ = MockTableFactory::kCorruptNone;
|
|
} else if (corrupt_mode_ == MockTableFactory::kCorruptKey) {
|
|
table_.push_back({key.ToString() + " ", value.ToString()});
|
|
corrupt_mode_ = MockTableFactory::kCorruptNone;
|
|
} else if (corrupt_mode_ == MockTableFactory::kCorruptReorderKey) {
|
|
if (prev_key_.empty()) {
|
|
prev_key_ = key.ToString();
|
|
prev_value_ = value.ToString();
|
|
} else {
|
|
table_.push_back({key.ToString(), value.ToString()});
|
|
table_.push_back({prev_key_, prev_value_});
|
|
corrupt_mode_ = MockTableFactory::kCorruptNone;
|
|
}
|
|
} else {
|
|
table_.push_back({key.ToString(), value.ToString()});
|
|
}
|
|
}
|
|
|
|
// Return non-ok iff some error has been detected.
|
|
Status status() const override { return Status::OK(); }
|
|
|
|
// Return non-ok iff some error happens during IO.
|
|
IOStatus io_status() const override { return IOStatus::OK(); }
|
|
|
|
Status Finish() override {
|
|
MutexLock lock_guard(&file_system_->mutex);
|
|
file_system_->files.insert({id_, table_});
|
|
return Status::OK();
|
|
}
|
|
|
|
void Abandon() override {}
|
|
|
|
uint64_t NumEntries() const override { return table_.size(); }
|
|
|
|
uint64_t FileSize() const override { return table_.size() * key_value_size_; }
|
|
|
|
TableProperties GetTableProperties() const override {
|
|
return TableProperties();
|
|
}
|
|
|
|
// Get file checksum
|
|
std::string GetFileChecksum() const override { return kUnknownFileChecksum; }
|
|
// Get file checksum function name
|
|
const char* GetFileChecksumFuncName() const override {
|
|
return kUnknownFileChecksumFuncName;
|
|
}
|
|
|
|
private:
|
|
uint32_t id_;
|
|
std::string prev_key_;
|
|
std::string prev_value_;
|
|
MockTableFileSystem* file_system_;
|
|
int corrupt_mode_;
|
|
KVVector table_;
|
|
size_t key_value_size_;
|
|
};
|
|
|
|
InternalIterator* MockTableReader::NewIterator(
|
|
const ReadOptions&, const SliceTransform* /* prefix_extractor */,
|
|
Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/,
|
|
size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
|
|
return new MockTableIterator(table_);
|
|
}
|
|
|
|
Status MockTableReader::Get(const ReadOptions&, const Slice& key,
|
|
GetContext* get_context,
|
|
const SliceTransform* /*prefix_extractor*/,
|
|
bool /*skip_filters*/) {
|
|
std::unique_ptr<MockTableIterator> iter(new MockTableIterator(table_));
|
|
for (iter->Seek(key); iter->Valid(); iter->Next()) {
|
|
ParsedInternalKey parsed_key;
|
|
Status pik_status =
|
|
ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */);
|
|
if (!pik_status.ok()) {
|
|
return pik_status;
|
|
}
|
|
|
|
bool dont_care __attribute__((__unused__));
|
|
if (!get_context->SaveValue(parsed_key, iter->value(), &dont_care)) {
|
|
break;
|
|
}
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties()
|
|
const {
|
|
return std::shared_ptr<const TableProperties>(new TableProperties());
|
|
}
|
|
|
|
MockTableFactory::MockTableFactory()
|
|
: next_id_(1), corrupt_mode_(MockTableFactory::kCorruptNone) {}
|
|
|
|
Status MockTableFactory::NewTableReader(
|
|
const ReadOptions& /*ro*/,
|
|
const TableReaderOptions& /*table_reader_options*/,
|
|
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /*file_size*/,
|
|
std::unique_ptr<TableReader>* table_reader,
|
|
bool /*prefetch_index_and_filter_in_cache*/) const {
|
|
uint32_t id;
|
|
Status s = GetIDFromFile(file.get(), &id);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
|
|
MutexLock lock_guard(&file_system_.mutex);
|
|
|
|
auto it = file_system_.files.find(id);
|
|
if (it == file_system_.files.end()) {
|
|
return Status::IOError("Mock file not found");
|
|
}
|
|
|
|
table_reader->reset(new MockTableReader(it->second));
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
TableBuilder* MockTableFactory::NewTableBuilder(
|
|
const TableBuilderOptions& /*table_builder_options*/,
|
|
WritableFileWriter* file) const {
|
|
uint32_t id;
|
|
Status s = GetAndWriteNextID(file, &id);
|
|
assert(s.ok());
|
|
|
|
return new MockTableBuilder(id, &file_system_, corrupt_mode_,
|
|
key_value_size_);
|
|
}
|
|
|
|
Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname,
|
|
KVVector file_contents) {
|
|
std::unique_ptr<WritableFileWriter> file_writer;
|
|
Status s = WritableFileWriter::Create(env->GetFileSystem(), fname,
|
|
FileOptions(), &file_writer, nullptr);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
uint32_t id;
|
|
s = GetAndWriteNextID(file_writer.get(), &id);
|
|
if (s.ok()) {
|
|
file_system_.files.insert({id, std::move(file_contents)});
|
|
}
|
|
return s;
|
|
}
|
|
|
|
Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file,
|
|
uint32_t* next_id) const {
|
|
*next_id = next_id_.fetch_add(1);
|
|
char buf[4];
|
|
EncodeFixed32(buf, *next_id);
|
|
return file->Append(Slice(buf, 4));
|
|
}
|
|
|
|
Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file,
|
|
uint32_t* id) const {
|
|
char buf[4];
|
|
Slice result;
|
|
Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr,
|
|
Env::IO_TOTAL /* rate_limiter_priority */);
|
|
assert(result.size() == 4);
|
|
*id = DecodeFixed32(buf);
|
|
return s;
|
|
}
|
|
|
|
void MockTableFactory::AssertSingleFile(const KVVector& file_contents) {
|
|
ASSERT_EQ(file_system_.files.size(), 1U);
|
|
ASSERT_EQ(file_contents, file_system_.files.begin()->second);
|
|
}
|
|
|
|
void MockTableFactory::AssertLatestFiles(
|
|
const std::vector<KVVector>& files_contents) {
|
|
ASSERT_GE(file_system_.files.size(), files_contents.size());
|
|
auto it = file_system_.files.rbegin();
|
|
for (auto expect = files_contents.rbegin(); expect != files_contents.rend();
|
|
expect++, it++) {
|
|
ASSERT_TRUE(it != file_system_.files.rend());
|
|
if (*expect != it->second) {
|
|
std::cout << "Wrong content! Content of file, expect:" << std::endl;
|
|
for (const auto& kv : *expect) {
|
|
ParsedInternalKey ikey;
|
|
std::string key, value;
|
|
std::tie(key, value) = kv;
|
|
ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */));
|
|
std::cout << ikey.DebugString(true, false) << " -> " << value
|
|
<< std::endl;
|
|
}
|
|
std::cout << "actual:" << std::endl;
|
|
for (const auto& kv : it->second) {
|
|
ParsedInternalKey ikey;
|
|
std::string key, value;
|
|
std::tie(key, value) = kv;
|
|
ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */));
|
|
std::cout << ikey.DebugString(true, false) << " -> " << value
|
|
<< std::endl;
|
|
}
|
|
FAIL();
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace mock
|
|
} // namespace ROCKSDB_NAMESPACE
|