Add table property tracking number of range deletions (#4016)

Summary:
Add a new table property, rocksdb.num.range-deletions, which tracks the
number of range deletions in a block-based table. Range deletions are no
longer counted in rocksdb.num.entries; as discovered in PR #3778, there
are various code paths that implicitly assume that rocksdb.num.entries
counts only true keys, not range deletions.

/cc ajkr nvanbenschoten
Closes https://github.com/facebook/rocksdb/pull/4016

Differential Revision: D8527575

Pulled By: ajkr

fbshipit-source-id: 92e7edbe78fda53756a558013c9fb496e7764fd7
This commit is contained in:
Nikhil Benesch 2018-06-26 20:18:43 -07:00 committed by Facebook Github Bot
parent 408205a36b
commit 17339dc2f3
8 changed files with 69 additions and 19 deletions

View File

@ -3,9 +3,11 @@
### Public API Change
* For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed.
* With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of low-pri list (the midpoint) when they first inserted into the cache. This is to make cache entries never get hit age out faster, improving cache efficiency when large background scan presents.
* The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries.
### New Features
* Changes the format of index blocks by storing the key in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatbile but not forward compatible. It is disabled by default unless format_version 3 or above is used.
* Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table.
* Improve the performance of iterators doing long range scans by using readahead, when using direct IO.
* pin_top_level_index_and_filter (default true) in BlockBasedTableOptions can be used in combination with cache_index_and_filter_blocks to prefetch and pin the top-level index of partitioned index and filter blocks in cache. It has no impact when cache_index_and_filter_blocks is false.

View File

@ -161,7 +161,8 @@ Status BuildTable(
nullptr /* upper_bound */, meta);
// Finish and check for builder errors
bool empty = builder->NumEntries() == 0;
tp = builder->GetTableProperties();
bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0;
s = c_iter.status();
if (!s.ok() || empty) {
builder->Abandon();
@ -174,7 +175,7 @@ Status BuildTable(
meta->fd.file_size = file_size;
meta->marked_for_compaction = builder->NeedCompact();
assert(meta->fd.GetFileSize() > 0);
tp = builder->GetTableProperties();
tp = builder->GetTableProperties(); // refresh now that builder is finished
if (table_properties) {
*table_properties = tp;
}

View File

@ -1217,7 +1217,12 @@ Status CompactionJob::FinishCompactionOutputFile(
}
sub_compact->outfile.reset();
if (s.ok() && current_entries == 0) {
TableProperties tp;
if (s.ok()) {
tp = sub_compact->builder->GetTableProperties();
}
if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
// If there is nothing to output, no necessary to generate a sst file.
// This happens when the output level is bottom level, at the same time
// the sub_compact output nothing.
@ -1236,10 +1241,8 @@ Status CompactionJob::FinishCompactionOutputFile(
}
ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
TableProperties tp;
if (s.ok() && current_entries > 0) {
if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
// Output to event logger and fire events.
tp = sub_compact->builder->GetTableProperties();
sub_compact->current_output()->table_properties =
std::make_shared<TableProperties>(tp);
ROCKS_LOG_INFO(db_options_.info_log,

View File

@ -170,6 +170,7 @@ void ResetTableProperties(TableProperties* tp) {
tp->raw_value_size = 0;
tp->num_data_blocks = 0;
tp->num_entries = 0;
tp->num_range_deletions = 0;
}
void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
@ -178,15 +179,18 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
ResetTableProperties(tp);
sscanf(tp_string.c_str(),
"# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64
"# data blocks %" SCNu64 " # entries %" SCNu64
" # range deletions %" SCNu64
" raw key size %" SCNu64
" raw average key size %lf "
" raw value size %" SCNu64
" raw average value size %lf "
" data block size %" SCNu64 " index block size (user-key? %" SCNu64
") %" SCNu64 " filter block size %" SCNu64,
&tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size,
&dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
&tp->index_key_is_user_key, &tp->index_size, &tp->filter_size);
&tp->num_data_blocks, &tp->num_entries, &tp->num_range_deletions,
&tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double,
&tp->data_size, &tp->index_key_is_user_key, &tp->index_size,
&tp->filter_size);
}
void VerifySimilar(uint64_t a, uint64_t b, double bias) {
@ -217,20 +221,25 @@ void VerifyTableProperties(const TableProperties& base_tp,
ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size);
ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size);
ASSERT_EQ(base_tp.num_entries, new_tp.num_entries);
ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
}
void GetExpectedTableProperties(TableProperties* expected_tp,
const int kKeySize, const int kValueSize,
const int kKeysPerTable, const int kTableCount,
const int kKeysPerTable,
const int kRangeDeletionsPerTable,
const int kTableCount,
const int kBloomBitsPerKey,
const size_t kBlockSize,
const bool index_key_is_user_key) {
const int kKeyCount = kTableCount * kKeysPerTable;
const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
const int kAvgSuccessorSize = kKeySize / 5;
const int kEncodingSavePerKey = kKeySize / 4;
expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
expected_tp->raw_value_size = kKeyCount * kValueSize;
expected_tp->raw_key_size = (kKeyCount + kRangeDeletionCount) * (kKeySize + 8);
expected_tp->raw_value_size = (kKeyCount + kRangeDeletionCount) * kValueSize;
expected_tp->num_entries = kKeyCount;
expected_tp->num_range_deletions = kRangeDeletionCount;
expected_tp->num_data_blocks =
kTableCount *
(kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
@ -291,6 +300,7 @@ TEST_F(DBPropertiesTest, ValidateSampleNumber) {
TEST_F(DBPropertiesTest, AggregatedTableProperties) {
for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
const int kRangeDeletionsPerTable = 5;
const int kKeysPerTable = 100;
const int kKeySize = 80;
const int kValueSize = 200;
@ -309,12 +319,22 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
DestroyAndReopen(options);
// Hold open a snapshot to prevent range tombstones from being compacted
// away.
ManagedSnapshot snapshot(db_);
Random rnd(5632);
for (int table = 1; table <= kTableCount; ++table) {
for (int i = 0; i < kKeysPerTable; ++i) {
db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
RandomString(&rnd, kValueSize));
}
for (int i = 0; i < kRangeDeletionsPerTable; i++) {
std::string start = RandomString(&rnd, kKeySize);
std::string end = start;
end.resize(kValueSize);
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
}
db_->Flush(FlushOptions());
}
std::string property;
@ -325,7 +345,8 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
TableProperties expected_tp;
GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
kKeysPerTable, kTableCount, kBloomBitsPerKey,
kKeysPerTable, kRangeDeletionsPerTable,
kTableCount, kBloomBitsPerKey,
table_options.block_size, index_key_is_user_key);
VerifyTableProperties(expected_tp, output_tp);
@ -448,6 +469,7 @@ TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {
TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
const int kTableCount = 100;
const int kRangeDeletionsPerTable = 2;
const int kKeysPerTable = 10;
const int kKeySize = 50;
const int kValueSize = 400;
@ -473,6 +495,9 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
DestroyAndReopen(options);
// Hold open a snapshot to prevent range tombstones from being compacted away.
ManagedSnapshot snapshot(db_);
std::string level_tp_strings[kMaxLevel];
std::string tp_string;
TableProperties level_tps[kMaxLevel];
@ -482,6 +507,12 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
RandomString(&rnd, kValueSize));
}
for (int i = 0; i < kRangeDeletionsPerTable; i++) {
std::string start = RandomString(&rnd, kKeySize);
std::string end = start;
end.resize(kValueSize);
db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end);
}
db_->Flush(FlushOptions());
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ResetTableProperties(&sum_tp);
@ -497,6 +528,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
sum_tp.raw_value_size += level_tps[level].raw_value_size;
sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
sum_tp.num_entries += level_tps[level].num_entries;
sum_tp.num_range_deletions += level_tps[level].num_range_deletions;
}
db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
ParseTablePropertiesString(tp_string, &tp);
@ -508,13 +540,15 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
if (table > 3) {
GetExpectedTableProperties(
&expected_tp, kKeySize, kValueSize, kKeysPerTable, table,
kBloomBitsPerKey, table_options.block_size, index_key_is_user_key);
&expected_tp, kKeySize, kValueSize, kKeysPerTable,
kRangeDeletionsPerTable, table, kBloomBitsPerKey,
table_options.block_size, index_key_is_user_key);
// Gives larger bias here as index block size, filter block size,
// and data block size become much harder to estimate in this test.
VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25);
VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25);
}
}
}

View File

@ -39,6 +39,7 @@ struct TablePropertiesNames {
static const std::string kRawValueSize;
static const std::string kNumDataBlocks;
static const std::string kNumEntries;
static const std::string kNumRangeDeletions;
static const std::string kFormatVersion;
static const std::string kFixedKeyLen;
static const std::string kFilterPolicy;
@ -148,6 +149,8 @@ struct TableProperties {
uint64_t num_data_blocks = 0;
// the number of entries in this table
uint64_t num_entries = 0;
// the number of range deletions in this table
uint64_t num_range_deletions = 0;
// format version, reserved for backward compatibility
uint64_t format_version = 0;
// If 0, key is variable length. Otherwise number of bytes for each key.

View File

@ -444,9 +444,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
r->ioptions.info_log);
} else if (value_type == kTypeRangeDeletion) {
// TODO(wanning&andrewkr) add num_tomestone to table properties
r->range_del_block.Add(key, value);
++r->props.num_entries;
++r->props.num_range_deletions;
r->props.raw_key_size += key.size();
r->props.raw_value_size += value.size();
NotifyCollectTableCollectorsOnAdd(key, value, r->offset,

View File

@ -77,6 +77,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
}
Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key);
Add(TablePropertiesNames::kNumEntries, props.num_entries);
Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions);
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
Add(TablePropertiesNames::kFilterSize, props.filter_size);
Add(TablePropertiesNames::kFormatVersion, props.format_version);
@ -224,6 +225,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
{TablePropertiesNames::kNumDataBlocks,
&new_table_properties->num_data_blocks},
{TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
{TablePropertiesNames::kNumRangeDeletions,
&new_table_properties->num_range_deletions},
{TablePropertiesNames::kFormatVersion,
&new_table_properties->format_version},
{TablePropertiesNames::kFixedKeyLen,

View File

@ -78,6 +78,8 @@ std::string TableProperties::ToString(
AppendProperty(result, "# data blocks", num_data_blocks, prop_delim,
kv_delim);
AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim);
AppendProperty(result, "# range deletions", num_range_deletions, prop_delim,
kv_delim);
AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim);
AppendProperty(result, "raw average key size",
@ -166,6 +168,7 @@ void TableProperties::Add(const TableProperties& tp) {
raw_value_size += tp.raw_value_size;
num_data_blocks += tp.num_data_blocks;
num_entries += tp.num_entries;
num_range_deletions += tp.num_range_deletions;
}
const std::string TablePropertiesNames::kDataSize =
@ -188,6 +191,8 @@ const std::string TablePropertiesNames::kNumDataBlocks =
"rocksdb.num.data.blocks";
const std::string TablePropertiesNames::kNumEntries =
"rocksdb.num.entries";
const std::string TablePropertiesNames::kNumRangeDeletions =
"rocksdb.num.range-deletions";
const std::string TablePropertiesNames::kFilterPolicy =
"rocksdb.filter.policy";
const std::string TablePropertiesNames::kFormatVersion =