rocksdb/table/mock_table.h

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once

#include <algorithm>
#include <atomic>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "db/version_edit.h"
#include "port/port.h"
#include "rocksdb/comparator.h"
#include "rocksdb/io_status.h"
#include "rocksdb/table.h"
#include "table/internal_iterator.h"
#include "table/table_builder.h"
#include "table/table_reader.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/kv_map.h"
#include "util/mutexlock.h"
namespace ROCKSDB_NAMESPACE {
namespace mock {

using KVPair = std::pair<std::string, std::string>;
using KVVector = std::vector<KVPair>;

KVVector MakeMockFile(std::initializer_list<KVPair> l = {});

void SortKVVector(KVVector* kv_vector,
                  const Comparator* ucmp = BytewiseComparator());
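
// Illustrative usage (a sketch, not part of the original header): mock file
// contents are <internal_key, value> pairs, so a test might build and sort a
// file like this:
//
//   KVVector contents = MakeMockFile(
//       {{InternalKey("b", 2, kTypeValue).Encode().ToString(), "v2"},
//        {InternalKey("a", 1, kTypeValue).Encode().ToString(), "v1"}});
//   SortKVVector(&contents);  // ucmp defaults to BytewiseComparator()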

struct MockTableFileSystem {
  port::Mutex mutex;
  std::map<uint32_t, KVVector> files;
};

class MockTableFactory : public TableFactory {
 public:
  enum MockCorruptionMode {
    kCorruptNone,
    kCorruptKey,
    kCorruptValue,
    kCorruptReorderKey,
  };

  MockTableFactory();

  static const char* kClassName() { return "MockTable"; }
  const char* Name() const override { return kClassName(); }

  using TableFactory::NewTableReader;
  Status NewTableReader(
      const ReadOptions& ro, const TableReaderOptions& table_reader_options,
      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      std::unique_ptr<TableReader>* table_reader,
      bool prefetch_index_and_filter_in_cache = true) const override;

  TableBuilder* NewTableBuilder(
      const TableBuilderOptions& table_builder_options,
      WritableFileWriter* file) const override;

  // Creates a mock table directly instead of going through MockTableBuilder.
  // file_contents must consist of <internal_key, value> pairs; those
  // key-value pairs are inserted into the mock table as-is.
  Status CreateMockTable(Env* env, const std::string& fname,
                         KVVector file_contents);

  std::string GetPrintableOptions() const override { return std::string(); }

  void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; }

  void SetKeyValueSize(size_t size) { key_value_size_ = size; }

  // Asserts that exactly one file exists and that its contents are equal to
  // file_contents.
  void AssertSingleFile(const KVVector& file_contents);
  void AssertLatestFiles(const std::vector<KVVector>& files_contents);

  std::unique_ptr<TableFactory> Clone() const override {
    return nullptr;  // Not implemented
  }

 private:
  Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const;
  Status GetIDFromFile(RandomAccessFileReader* file, uint32_t* id) const;

  mutable MockTableFileSystem file_system_;
  mutable std::atomic<uint32_t> next_id_;
  MockCorruptionMode corrupt_mode_;
  size_t key_value_size_ = 1;
};
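
// A minimal test-wiring sketch (illustrative; `expected_contents`, `env`, and
// the file name are assumptions, not part of this header): install the
// factory so table reads and writes go through the in-memory mock, then
// verify what was written.
//
//   auto factory = std::make_shared<MockTableFactory>();
//   Options options;
//   options.table_factory = factory;
//   // ... run a flush or compaction under `options` ...
//   factory->AssertSingleFile(expected_contents);
//
// Alternatively, pre-populate a table without going through a builder
// (the path below is hypothetical):
//
//   Status s = factory->CreateMockTable(env, "/tmp/mock/000001.sst",
//                                       std::move(contents));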

class MockTableReader : public TableReader {
 public:
  explicit MockTableReader(const mock::KVVector& table) : table_(table) {
    tp_.num_entries = table_.size();
    tp_.num_range_deletions = 0;
    tp_.raw_key_size = 1;
    tp_.raw_value_size = 1;
  }

  explicit MockTableReader(const mock::KVVector& table,
                           const TableProperties& tp)
      : table_(table), tp_(tp) {}

  virtual InternalIterator* NewIterator(
      const ReadOptions&, const SliceTransform* prefix_extractor, Arena* arena,
      bool skip_filters, TableReaderCaller caller,
      size_t compaction_readahead_size = 0,
      bool allow_unprepared_value = false) override;

  virtual Status Get(const ReadOptions& readOptions, const Slice& key,
                     GetContext* get_context,
                     const SliceTransform* prefix_extractor,
                     bool skip_filters = false) override;

  virtual uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/,
                                       const Slice& /*key*/,
                                       TableReaderCaller /*caller*/) override {
    return 0;
  }

  virtual uint64_t ApproximateSize(const ReadOptions& /*read_options*/,
                                   const Slice& /*start*/,
                                   const Slice& /*end*/,
                                   TableReaderCaller /*caller*/) override {
    return 0;
  }

  virtual size_t ApproximateMemoryUsage() const override { return 0; }

  virtual void SetupForCompaction() override {}

  virtual std::shared_ptr<const TableProperties> GetTableProperties()
      const override {
    return std::make_shared<const TableProperties>(tp_);
  }

  ~MockTableReader() = default;

 private:
  const KVVector& table_;
  TableProperties tp_;
};
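
// Sketch (illustrative, not from this header): the two-argument constructor
// lets a test supply its own table properties instead of the defaults derived
// from `table`:
//
//   TableProperties tp;
//   tp.num_entries = contents.size();
//   MockTableReader reader(contents, tp);
//   auto props = reader.GetTableProperties();  // returns a copy of `tp`
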
} // namespace mock
} // namespace ROCKSDB_NAMESPACE