mirror of https://github.com/facebook/rocksdb.git
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary: The implementation of GetApproximateSizes was inconsistent in its treatment of the size of non-data blocks of SST files, sometimes including and sometimes now. This was at its worst with large portion of table file used by filters and querying a small range that crossed a table boundary: the size estimate would include large filter size. It's conceivable that someone might want only to know the size in terms of data blocks, but I believe that's unlikely enough to ignore for now. Similarly, there's no evidence the internal function AppoximateOffsetOf is used for anything other than a one-sided ApproximateSize, so I intend to refactor to remove redundancy in a follow-up commit. So to fix this, GetApproximateSizes (and implementation details ApproximateSize and ApproximateOffsetOf) now consistently include in their returned sizes a portion of table file metadata (incl filters and indexes) based on the size portion of the data blocks in range. In other words, if a key range covers data blocks that are X% by size of all the table's data blocks, returned approximate size is X% of the total file size. It would technically be more accurate to attribute metadata based on number of keys, but that's not computationally efficient with data available and rarely a meaningful difference. Also includes miscellaneous comment improvements / clarifications. Also included is a new approximatesizerandom benchmark for db_bench. No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks. Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784 Test Plan: Test added to DBTest.ApproximateSizesFilesWithErrorMargin. Old code running new test... [ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin db/db_test.cc:1562: Failure Expected: (size) <= (11 * 100), actual: 9478 vs 1100 Other tests updated to reflect consistent accounting of metadata. Reviewed By: siying Differential Revision: D21334706 Pulled By: pdillinger fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
This commit is contained in:
parent
298b00a396
commit
14eca6bf04
|
@ -10,6 +10,7 @@
|
||||||
* Fix a use-after-free bug in best-efforts recovery. column_family_memtables_ needs to point to valid ColumnFamilySet.
|
* Fix a use-after-free bug in best-efforts recovery. column_family_memtables_ needs to point to valid ColumnFamilySet.
|
||||||
* Let best-efforts recovery ignore corrupted files during table loading.
|
* Let best-efforts recovery ignore corrupted files during table loading.
|
||||||
* Fix corrupt key read from ingested file when iterator direction switches from reverse to forward at a key that is a prefix of another key in the same file. It is only possible in files with a non-zero global seqno.
|
* Fix corrupt key read from ingested file when iterator direction switches from reverse to forward at a key that is a prefix of another key in the same file. It is only possible in files with a non-zero global seqno.
|
||||||
|
* Fix abnormally large estimate from GetApproximateSizes when a range starts near the end of one SST file and near the beginning of another. Now GetApproximateSizes consistently and fairly includes the size of SST metadata in addition to data blocks, attributing metadata proportionally among the data blocks based on their size.
|
||||||
|
|
||||||
### Public API Change
|
### Public API Change
|
||||||
* Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request.
|
* Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request.
|
||||||
|
|
|
@ -1489,18 +1489,24 @@ TEST_F(DBTest, ApproximateSizesMemTable) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
|
TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
|
||||||
|
// Roughly 4 keys per data block, 1000 keys per file,
|
||||||
|
// with filter substantially larger than a data block
|
||||||
|
BlockBasedTableOptions table_options;
|
||||||
|
table_options.filter_policy.reset(NewBloomFilterPolicy(16));
|
||||||
|
table_options.block_size = 100;
|
||||||
Options options = CurrentOptions();
|
Options options = CurrentOptions();
|
||||||
options.write_buffer_size = 1024 * 1024;
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
||||||
|
options.write_buffer_size = 24 * 1024;
|
||||||
options.compression = kNoCompression;
|
options.compression = kNoCompression;
|
||||||
options.create_if_missing = true;
|
options.create_if_missing = true;
|
||||||
options.target_file_size_base = 1024 * 1024;
|
options.target_file_size_base = 24 * 1024;
|
||||||
DestroyAndReopen(options);
|
DestroyAndReopen(options);
|
||||||
const auto default_cf = db_->DefaultColumnFamily();
|
const auto default_cf = db_->DefaultColumnFamily();
|
||||||
|
|
||||||
const int N = 64000;
|
const int N = 64000;
|
||||||
Random rnd(301);
|
Random rnd(301);
|
||||||
for (int i = 0; i < N; i++) {
|
for (int i = 0; i < N; i++) {
|
||||||
ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
|
ASSERT_OK(Put(Key(i), RandomString(&rnd, 24)));
|
||||||
}
|
}
|
||||||
// Flush everything to files
|
// Flush everything to files
|
||||||
Flush();
|
Flush();
|
||||||
|
@ -1509,7 +1515,7 @@ TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
|
||||||
|
|
||||||
// Write more keys
|
// Write more keys
|
||||||
for (int i = N; i < (N + N / 4); i++) {
|
for (int i = N; i < (N + N / 4); i++) {
|
||||||
ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
|
ASSERT_OK(Put(Key(i), RandomString(&rnd, 24)));
|
||||||
}
|
}
|
||||||
// Flush everything to files again
|
// Flush everything to files again
|
||||||
Flush();
|
Flush();
|
||||||
|
@ -1517,27 +1523,45 @@ TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
|
||||||
// Wait for compaction to finish
|
// Wait for compaction to finish
|
||||||
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
||||||
|
|
||||||
const std::string start = Key(0);
|
{
|
||||||
const std::string end = Key(2 * N);
|
const std::string start = Key(0);
|
||||||
const Range r(start, end);
|
const std::string end = Key(2 * N);
|
||||||
|
const Range r(start, end);
|
||||||
|
|
||||||
SizeApproximationOptions size_approx_options;
|
SizeApproximationOptions size_approx_options;
|
||||||
size_approx_options.include_memtabtles = false;
|
size_approx_options.include_memtabtles = false;
|
||||||
size_approx_options.include_files = true;
|
size_approx_options.include_files = true;
|
||||||
size_approx_options.files_size_error_margin = -1.0; // disabled
|
size_approx_options.files_size_error_margin = -1.0; // disabled
|
||||||
|
|
||||||
// Get the precise size without any approximation heuristic
|
// Get the precise size without any approximation heuristic
|
||||||
uint64_t size;
|
uint64_t size;
|
||||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size);
|
||||||
ASSERT_NE(size, 0);
|
ASSERT_NE(size, 0);
|
||||||
|
|
||||||
// Get the size with an approximation heuristic
|
// Get the size with an approximation heuristic
|
||||||
uint64_t size2;
|
uint64_t size2;
|
||||||
const double error_margin = 0.2;
|
const double error_margin = 0.2;
|
||||||
size_approx_options.files_size_error_margin = error_margin;
|
size_approx_options.files_size_error_margin = error_margin;
|
||||||
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
|
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2);
|
||||||
ASSERT_LT(size2, size * (1 + error_margin));
|
ASSERT_LT(size2, size * (1 + error_margin));
|
||||||
ASSERT_GT(size2, size * (1 - error_margin));
|
ASSERT_GT(size2, size * (1 - error_margin));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Ensure that metadata is not falsely attributed only to the last data in
|
||||||
|
// the file. (In some applications, filters can be large portion of data
|
||||||
|
// size.)
|
||||||
|
// Perform many queries over small range, enough to ensure crossing file
|
||||||
|
// boundary, and make sure we never see a spike for large filter.
|
||||||
|
for (int i = 0; i < 3000; i += 10) {
|
||||||
|
const std::string start = Key(i);
|
||||||
|
const std::string end = Key(i + 11); // overlap by 1 key
|
||||||
|
const Range r(start, end);
|
||||||
|
uint64_t size;
|
||||||
|
db_->GetApproximateSizes(&r, 1, &size);
|
||||||
|
ASSERT_LE(size, 11 * 100);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(DBTest, GetApproximateMemTableStats) {
|
TEST_F(DBTest, GetApproximateMemTableStats) {
|
||||||
|
@ -1675,12 +1699,13 @@ TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
|
||||||
ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000));
|
ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000));
|
||||||
ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000));
|
ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000));
|
||||||
ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000));
|
ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000));
|
||||||
ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000));
|
ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 232000));
|
||||||
ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000));
|
ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 242000));
|
||||||
ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000));
|
// Ensure some overhead is accounted for, even without including all
|
||||||
ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000));
|
ASSERT_TRUE(Between(Size("", Key(7), 1), 540500, 545000));
|
||||||
|
ASSERT_TRUE(Between(Size("", Key(8), 1), 550500, 555000));
|
||||||
|
|
||||||
ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000));
|
ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110100, 111000));
|
||||||
|
|
||||||
dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
|
dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -5489,7 +5489,8 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
|
||||||
static_cast<uint64_t>(total_full_size * margin)) {
|
static_cast<uint64_t>(total_full_size * margin)) {
|
||||||
total_full_size += total_intersecting_size / 2;
|
total_full_size += total_intersecting_size / 2;
|
||||||
} else {
|
} else {
|
||||||
// Estimate for all the first files, at each level
|
// Estimate for all the first files (might also be last files), at each
|
||||||
|
// level
|
||||||
for (const auto file_ptr : first_files) {
|
for (const auto file_ptr : first_files) {
|
||||||
total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
|
total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1004,32 +1004,33 @@ class DB {
|
||||||
};
|
};
|
||||||
|
|
||||||
// For each i in [0,n-1], store in "sizes[i]", the approximate
|
// For each i in [0,n-1], store in "sizes[i]", the approximate
|
||||||
// file system space used by keys in "[range[i].start .. range[i].limit)".
|
// file system space used by keys in "[range[i].start .. range[i].limit)"
|
||||||
|
// in a single column family.
|
||||||
//
|
//
|
||||||
// Note that the returned sizes measure file system space usage, so
|
// Note that the returned sizes measure file system space usage, so
|
||||||
// if the user data compresses by a factor of ten, the returned
|
// if the user data compresses by a factor of ten, the returned
|
||||||
// sizes will be one-tenth the size of the corresponding user data size.
|
// sizes will be one-tenth the size of the corresponding user data size.
|
||||||
virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
|
virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
|
||||||
ColumnFamilyHandle* column_family,
|
ColumnFamilyHandle* column_family,
|
||||||
const Range* range, int n,
|
const Range* ranges, int n,
|
||||||
uint64_t* sizes) = 0;
|
uint64_t* sizes) = 0;
|
||||||
|
|
||||||
// Simpler versions of the GetApproximateSizes() method above.
|
// Simpler versions of the GetApproximateSizes() method above.
|
||||||
// The include_flags argumenbt must of type DB::SizeApproximationFlags
|
// The include_flags argumenbt must of type DB::SizeApproximationFlags
|
||||||
// and can not be NONE.
|
// and can not be NONE.
|
||||||
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
|
||||||
const Range* range, int n, uint64_t* sizes,
|
const Range* ranges, int n, uint64_t* sizes,
|
||||||
uint8_t include_flags = INCLUDE_FILES) {
|
uint8_t include_flags = INCLUDE_FILES) {
|
||||||
SizeApproximationOptions options;
|
SizeApproximationOptions options;
|
||||||
options.include_memtabtles =
|
options.include_memtabtles =
|
||||||
(include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0;
|
(include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0;
|
||||||
options.include_files =
|
options.include_files =
|
||||||
(include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0;
|
(include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0;
|
||||||
GetApproximateSizes(options, column_family, range, n, sizes);
|
GetApproximateSizes(options, column_family, ranges, n, sizes);
|
||||||
}
|
}
|
||||||
virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
|
virtual void GetApproximateSizes(const Range* ranges, int n, uint64_t* sizes,
|
||||||
uint8_t include_flags = INCLUDE_FILES) {
|
uint8_t include_flags = INCLUDE_FILES) {
|
||||||
GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags);
|
GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, include_flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
// The method is similar to GetApproximateSizes, except it
|
// The method is similar to GetApproximateSizes, except it
|
||||||
|
|
|
@ -656,8 +656,8 @@ Status BlockBasedTable::Open(
|
||||||
// access a dangling pointer.
|
// access a dangling pointer.
|
||||||
BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
|
BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
|
||||||
Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
|
Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
|
||||||
internal_comparator, skip_filters, level,
|
internal_comparator, skip_filters,
|
||||||
immortal_table);
|
file_size, level, immortal_table);
|
||||||
rep->file = std::move(file);
|
rep->file = std::move(file);
|
||||||
rep->footer = footer;
|
rep->footer = footer;
|
||||||
rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
|
rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
|
||||||
|
@ -3012,30 +3012,37 @@ Status BlockBasedTable::CreateIndexReader(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t BlockBasedTable::ApproximateOffsetOf(
|
uint64_t BlockBasedTable::ApproximateDataOffsetOf(
|
||||||
const InternalIteratorBase<IndexValue>& index_iter) const {
|
const InternalIteratorBase<IndexValue>& index_iter,
|
||||||
uint64_t result = 0;
|
uint64_t data_size) const {
|
||||||
if (index_iter.Valid()) {
|
if (index_iter.Valid()) {
|
||||||
BlockHandle handle = index_iter.value().handle;
|
BlockHandle handle = index_iter.value().handle;
|
||||||
result = handle.offset();
|
return handle.offset();
|
||||||
} else {
|
} else {
|
||||||
// The iterator is past the last key in the file. If table_properties is not
|
// The iterator is past the last key in the file.
|
||||||
// available, approximate the offset by returning the offset of the
|
return data_size;
|
||||||
// metaindex block (which is right near the end of the file).
|
|
||||||
if (rep_->table_properties) {
|
|
||||||
result = rep_->table_properties->data_size;
|
|
||||||
}
|
|
||||||
// table_properties is not present in the table.
|
|
||||||
if (result == 0) {
|
|
||||||
result = rep_->footer.metaindex_handle().offset();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return result;
|
uint64_t BlockBasedTable::GetApproximateDataSize() {
|
||||||
|
// Should be in table properties unless super old version
|
||||||
|
if (rep_->table_properties) {
|
||||||
|
return rep_->table_properties->data_size;
|
||||||
|
}
|
||||||
|
// Fall back to rough estimate from footer
|
||||||
|
return rep_->footer.metaindex_handle().offset();
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
|
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
|
||||||
TableReaderCaller caller) {
|
TableReaderCaller caller) {
|
||||||
|
uint64_t data_size = GetApproximateDataSize();
|
||||||
|
if (UNLIKELY(data_size == 0)) {
|
||||||
|
// Hmm. Let's just split in half to avoid skewing one way or another,
|
||||||
|
// since we don't know whether we're operating on lower bound or
|
||||||
|
// upper bound.
|
||||||
|
return rep_->file_size / 2;
|
||||||
|
}
|
||||||
|
|
||||||
BlockCacheLookupContext context(caller);
|
BlockCacheLookupContext context(caller);
|
||||||
IndexBlockIter iiter_on_stack;
|
IndexBlockIter iiter_on_stack;
|
||||||
ReadOptions ro;
|
ReadOptions ro;
|
||||||
|
@ -3050,13 +3057,27 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
|
||||||
}
|
}
|
||||||
|
|
||||||
index_iter->Seek(key);
|
index_iter->Seek(key);
|
||||||
return ApproximateOffsetOf(*index_iter);
|
|
||||||
|
uint64_t offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||||
|
// Pro-rate file metadata (incl filters) size-proportionally across data
|
||||||
|
// blocks.
|
||||||
|
double size_ratio =
|
||||||
|
static_cast<double>(offset) / static_cast<double>(data_size);
|
||||||
|
return static_cast<uint64_t>(size_ratio *
|
||||||
|
static_cast<double>(rep_->file_size));
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
|
uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
|
||||||
TableReaderCaller caller) {
|
TableReaderCaller caller) {
|
||||||
assert(rep_->internal_comparator.Compare(start, end) <= 0);
|
assert(rep_->internal_comparator.Compare(start, end) <= 0);
|
||||||
|
|
||||||
|
uint64_t data_size = GetApproximateDataSize();
|
||||||
|
if (UNLIKELY(data_size == 0)) {
|
||||||
|
// Hmm. Assume whole file is involved, since we have lower and upper
|
||||||
|
// bound.
|
||||||
|
return rep_->file_size;
|
||||||
|
}
|
||||||
|
|
||||||
BlockCacheLookupContext context(caller);
|
BlockCacheLookupContext context(caller);
|
||||||
IndexBlockIter iiter_on_stack;
|
IndexBlockIter iiter_on_stack;
|
||||||
ReadOptions ro;
|
ReadOptions ro;
|
||||||
|
@ -3071,12 +3092,17 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
|
||||||
}
|
}
|
||||||
|
|
||||||
index_iter->Seek(start);
|
index_iter->Seek(start);
|
||||||
uint64_t start_offset = ApproximateOffsetOf(*index_iter);
|
uint64_t start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||||
index_iter->Seek(end);
|
index_iter->Seek(end);
|
||||||
uint64_t end_offset = ApproximateOffsetOf(*index_iter);
|
uint64_t end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||||
|
|
||||||
assert(end_offset >= start_offset);
|
assert(end_offset >= start_offset);
|
||||||
return end_offset - start_offset;
|
// Pro-rate file metadata (incl filters) size-proportionally across data
|
||||||
|
// blocks.
|
||||||
|
double size_ratio = static_cast<double>(end_offset - start_offset) /
|
||||||
|
static_cast<double>(data_size);
|
||||||
|
return static_cast<uint64_t>(size_ratio *
|
||||||
|
static_cast<double>(rep_->file_size));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BlockBasedTable::TEST_FilterBlockInCache() const {
|
bool BlockBasedTable::TEST_FilterBlockInCache() const {
|
||||||
|
|
|
@ -441,9 +441,13 @@ class BlockBasedTable : public TableReader {
|
||||||
static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer,
|
static void GenerateCachePrefix(Cache* cc, FSWritableFile* file, char* buffer,
|
||||||
size_t* size);
|
size_t* size);
|
||||||
|
|
||||||
// Given an iterator return its offset in file.
|
// Size of all data blocks, maybe approximate
|
||||||
uint64_t ApproximateOffsetOf(
|
uint64_t GetApproximateDataSize();
|
||||||
const InternalIteratorBase<IndexValue>& index_iter) const;
|
|
||||||
|
// Given an iterator return its offset in data block section of file.
|
||||||
|
uint64_t ApproximateDataOffsetOf(
|
||||||
|
const InternalIteratorBase<IndexValue>& index_iter,
|
||||||
|
uint64_t data_size) const;
|
||||||
|
|
||||||
// Helper functions for DumpTable()
|
// Helper functions for DumpTable()
|
||||||
Status DumpIndexBlock(WritableFile* out_file);
|
Status DumpIndexBlock(WritableFile* out_file);
|
||||||
|
@ -482,7 +486,7 @@ struct BlockBasedTable::Rep {
|
||||||
Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
|
Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
|
||||||
const BlockBasedTableOptions& _table_opt,
|
const BlockBasedTableOptions& _table_opt,
|
||||||
const InternalKeyComparator& _internal_comparator, bool skip_filters,
|
const InternalKeyComparator& _internal_comparator, bool skip_filters,
|
||||||
int _level, const bool _immortal_table)
|
uint64_t _file_size, int _level, const bool _immortal_table)
|
||||||
: ioptions(_ioptions),
|
: ioptions(_ioptions),
|
||||||
env_options(_env_options),
|
env_options(_env_options),
|
||||||
table_options(_table_opt),
|
table_options(_table_opt),
|
||||||
|
@ -494,6 +498,7 @@ struct BlockBasedTable::Rep {
|
||||||
whole_key_filtering(_table_opt.whole_key_filtering),
|
whole_key_filtering(_table_opt.whole_key_filtering),
|
||||||
prefix_filtering(true),
|
prefix_filtering(true),
|
||||||
global_seqno(kDisableGlobalSequenceNumber),
|
global_seqno(kDisableGlobalSequenceNumber),
|
||||||
|
file_size(_file_size),
|
||||||
level(_level),
|
level(_level),
|
||||||
immortal_table(_immortal_table) {}
|
immortal_table(_immortal_table) {}
|
||||||
|
|
||||||
|
@ -551,6 +556,9 @@ struct BlockBasedTable::Rep {
|
||||||
// and every key have it's own seqno.
|
// and every key have it's own seqno.
|
||||||
SequenceNumber global_seqno;
|
SequenceNumber global_seqno;
|
||||||
|
|
||||||
|
// Size of the table file on disk
|
||||||
|
uint64_t file_size;
|
||||||
|
|
||||||
// the level when the table is opened, could potentially change when trivial
|
// the level when the table is opened, could potentially change when trivial
|
||||||
// move is involved
|
// move is involved
|
||||||
int level;
|
int level;
|
||||||
|
|
|
@ -39,7 +39,7 @@ class MockBlockBasedTableTester {
|
||||||
constexpr bool immortal_table = false;
|
constexpr bool immortal_table = false;
|
||||||
table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep(
|
table_.reset(new MockBlockBasedTable(new BlockBasedTable::Rep(
|
||||||
ioptions_, env_options_, table_options_, icomp_, skip_filters,
|
ioptions_, env_options_, table_options_, icomp_, skip_filters,
|
||||||
kMockLevel, immortal_table)));
|
12345 /*file_size*/, kMockLevel, immortal_table)));
|
||||||
}
|
}
|
||||||
|
|
||||||
FilterBitsBuilder* GetBuilder() const {
|
FilterBitsBuilder* GetBuilder() const {
|
||||||
|
|
|
@ -143,11 +143,13 @@ class PartitionedFilterBlockTest
|
||||||
} while (status.IsIncomplete());
|
} while (status.IsIncomplete());
|
||||||
|
|
||||||
constexpr bool skip_filters = false;
|
constexpr bool skip_filters = false;
|
||||||
|
constexpr uint64_t file_size = 12345;
|
||||||
constexpr int level = 0;
|
constexpr int level = 0;
|
||||||
constexpr bool immortal_table = false;
|
constexpr bool immortal_table = false;
|
||||||
table_.reset(new MockedBlockBasedTable(
|
table_.reset(new MockedBlockBasedTable(
|
||||||
new BlockBasedTable::Rep(ioptions_, env_options_, table_options_,
|
new BlockBasedTable::Rep(ioptions_, env_options_, table_options_,
|
||||||
icomp_, skip_filters, level, immortal_table),
|
icomp_, skip_filters, file_size, level,
|
||||||
|
immortal_table),
|
||||||
pib));
|
pib));
|
||||||
BlockContents contents(slice);
|
BlockContents contents(slice);
|
||||||
CachableEntry<Block> block(
|
CachableEntry<Block> block(
|
||||||
|
|
|
@ -64,12 +64,19 @@ class TableReader {
|
||||||
// bytes, and so includes effects like compression of the underlying data.
|
// bytes, and so includes effects like compression of the underlying data.
|
||||||
// E.g., the approximate offset of the last key in the table will
|
// E.g., the approximate offset of the last key in the table will
|
||||||
// be close to the file length.
|
// be close to the file length.
|
||||||
|
// TODO(peterd): Since this function is only used for approximate size
|
||||||
|
// from beginning of file, reduce code duplication by removing this
|
||||||
|
// function and letting ApproximateSize take optional start and end, so
|
||||||
|
// that absolute start and end can be specified and optimized without
|
||||||
|
// key / index work.
|
||||||
virtual uint64_t ApproximateOffsetOf(const Slice& key,
|
virtual uint64_t ApproximateOffsetOf(const Slice& key,
|
||||||
TableReaderCaller caller) = 0;
|
TableReaderCaller caller) = 0;
|
||||||
|
|
||||||
// Given start and end keys, return the approximate data size in the file
|
// Given start and end keys, return the approximate data size in the file
|
||||||
// between the keys. The returned value is in terms of file bytes, and so
|
// between the keys. The returned value is in terms of file bytes, and so
|
||||||
// includes effects like compression of the underlying data.
|
// includes effects like compression of the underlying data and applicable
|
||||||
|
// portions of metadata including filters and indexes. Nullptr for start or
|
||||||
|
// end (or both) indicates absolute start or end of the table.
|
||||||
virtual uint64_t ApproximateSize(const Slice& start, const Slice& end,
|
virtual uint64_t ApproximateSize(const Slice& start, const Slice& end,
|
||||||
TableReaderCaller caller) = 0;
|
TableReaderCaller caller) = 0;
|
||||||
|
|
||||||
|
|
|
@ -3557,12 +3557,12 @@ static void DoCompressionTest(CompressionType comp) {
|
||||||
const MutableCFOptions moptions(options);
|
const MutableCFOptions moptions(options);
|
||||||
c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
|
c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
|
||||||
|
|
||||||
ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
|
||||||
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
|
||||||
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
|
||||||
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500));
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500));
|
||||||
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500));
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500));
|
||||||
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6500));
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7000));
|
||||||
c.ResetTableReader();
|
c.ResetTableReader();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -115,6 +115,7 @@ DEFINE_string(
|
||||||
"readrandomwriterandom,"
|
"readrandomwriterandom,"
|
||||||
"updaterandom,"
|
"updaterandom,"
|
||||||
"xorupdaterandom,"
|
"xorupdaterandom,"
|
||||||
|
"approximatesizerandom,"
|
||||||
"randomwithverify,"
|
"randomwithverify,"
|
||||||
"fill100K,"
|
"fill100K,"
|
||||||
"crc32c,"
|
"crc32c,"
|
||||||
|
@ -3021,6 +3022,10 @@ class Benchmark {
|
||||||
fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
|
fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
|
||||||
entries_per_batch_);
|
entries_per_batch_);
|
||||||
method = &Benchmark::MultiReadRandom;
|
method = &Benchmark::MultiReadRandom;
|
||||||
|
} else if (name == "approximatesizerandom") {
|
||||||
|
fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
|
||||||
|
entries_per_batch_);
|
||||||
|
method = &Benchmark::ApproximateSizeRandom;
|
||||||
} else if (name == "mixgraph") {
|
} else if (name == "mixgraph") {
|
||||||
method = &Benchmark::MixGraph;
|
method = &Benchmark::MixGraph;
|
||||||
} else if (name == "readmissing") {
|
} else if (name == "readmissing") {
|
||||||
|
@ -5211,6 +5216,53 @@ class Benchmark {
|
||||||
thread->stats.AddMessage(msg);
|
thread->stats.AddMessage(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Calls ApproximateSize over random key ranges.
|
||||||
|
void ApproximateSizeRandom(ThreadState* thread) {
|
||||||
|
int64_t size_sum = 0;
|
||||||
|
int64_t num_sizes = 0;
|
||||||
|
const size_t batch_size = entries_per_batch_;
|
||||||
|
std::vector<Range> ranges;
|
||||||
|
std::vector<Slice> lkeys;
|
||||||
|
std::vector<std::unique_ptr<const char[]>> lkey_guards;
|
||||||
|
std::vector<Slice> rkeys;
|
||||||
|
std::vector<std::unique_ptr<const char[]>> rkey_guards;
|
||||||
|
std::vector<uint64_t> sizes;
|
||||||
|
while (ranges.size() < batch_size) {
|
||||||
|
// Ugly without C++17 return from emplace_back
|
||||||
|
lkey_guards.emplace_back();
|
||||||
|
rkey_guards.emplace_back();
|
||||||
|
lkeys.emplace_back(AllocateKey(&lkey_guards.back()));
|
||||||
|
rkeys.emplace_back(AllocateKey(&rkey_guards.back()));
|
||||||
|
ranges.emplace_back(lkeys.back(), rkeys.back());
|
||||||
|
sizes.push_back(0);
|
||||||
|
}
|
||||||
|
Duration duration(FLAGS_duration, reads_);
|
||||||
|
while (!duration.Done(1)) {
|
||||||
|
DB* db = SelectDB(thread);
|
||||||
|
for (size_t i = 0; i < batch_size; ++i) {
|
||||||
|
int64_t lkey = GetRandomKey(&thread->rand);
|
||||||
|
int64_t rkey = GetRandomKey(&thread->rand);
|
||||||
|
if (lkey > rkey) {
|
||||||
|
std::swap(lkey, rkey);
|
||||||
|
}
|
||||||
|
GenerateKeyFromInt(lkey, FLAGS_num, &lkeys[i]);
|
||||||
|
GenerateKeyFromInt(rkey, FLAGS_num, &rkeys[i]);
|
||||||
|
}
|
||||||
|
db->GetApproximateSizes(&ranges[0], static_cast<int>(entries_per_batch_),
|
||||||
|
&sizes[0]);
|
||||||
|
num_sizes += entries_per_batch_;
|
||||||
|
for (int64_t size : sizes) {
|
||||||
|
size_sum += size;
|
||||||
|
}
|
||||||
|
thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kOthers);
|
||||||
|
}
|
||||||
|
|
||||||
|
char msg[100];
|
||||||
|
snprintf(msg, sizeof(msg), "(Avg approx size=%g)",
|
||||||
|
static_cast<double>(size_sum) / static_cast<double>(num_sizes));
|
||||||
|
thread->stats.AddMessage(msg);
|
||||||
|
}
|
||||||
|
|
||||||
// The inverse function of Pareto distribution
|
// The inverse function of Pareto distribution
|
||||||
int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
|
int64_t ParetoCdfInversion(double u, double theta, double k, double sigma) {
|
||||||
double ret;
|
double ret;
|
||||||
|
|
Loading…
Reference in New Issue