mirror of https://github.com/facebook/rocksdb.git
Fix the handling of wide-column base values in the max_successive_merges logic (#11913)
Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/11913 The `max_successive_merges` logic currently does not handle wide-column base values correctly, since it uses the `Get` API, which only returns the value of the default column. The patch fixes this by switching to `GetEntity` and passing all columns (if applicable) to the merge operator. Reviewed By: jaykorean Differential Revision: D49795097 fbshipit-source-id: 75eb7cc9476226255062cdb3d43ab6bd1cc2faa3
This commit is contained in:
parent
7bebd3036d
commit
b00fa5597e
|
@ -6,10 +6,12 @@
|
|||
#include <vector>
|
||||
|
||||
#include "db/db_test_util.h"
|
||||
#include "db/dbformat.h"
|
||||
#include "db/forward_iterator.h"
|
||||
#include "port/stack_trace.h"
|
||||
#include "rocksdb/merge_operator.h"
|
||||
#include "rocksdb/snapshot.h"
|
||||
#include "rocksdb/utilities/debug.h"
|
||||
#include "util/random.h"
|
||||
#include "utilities/merge_operators.h"
|
||||
#include "utilities/merge_operators/string_append/stringappend2.h"
|
||||
|
@ -949,6 +951,98 @@ TEST_P(PerConfigMergeOperatorPinningTest, Randomized) {
|
|||
VerifyDBFromMap(true_data);
|
||||
}
|
||||
|
||||
TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
|
||||
Options options = CurrentOptions();
|
||||
options.create_if_missing = true;
|
||||
options.merge_operator = MergeOperators::CreatePutOperator();
|
||||
options.max_successive_merges = 1;
|
||||
options.env = env_;
|
||||
Reopen(options);
|
||||
|
||||
constexpr char foo[] = "foo";
|
||||
constexpr char bar[] = "bar";
|
||||
constexpr char baz[] = "baz";
|
||||
constexpr char qux[] = "qux";
|
||||
constexpr char corge[] = "corge";
|
||||
|
||||
// No base value
|
||||
{
|
||||
constexpr char key[] = "key1";
|
||||
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, foo));
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar));
|
||||
|
||||
PinnableSlice result;
|
||||
ASSERT_OK(
|
||||
db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result));
|
||||
ASSERT_EQ(result, bar);
|
||||
|
||||
// We expect the second Merge to be converted to a Put because of
|
||||
// max_successive_merges.
|
||||
constexpr size_t max_key_versions = 8;
|
||||
std::vector<KeyVersion> key_versions;
|
||||
ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
|
||||
max_key_versions, &key_versions));
|
||||
ASSERT_EQ(key_versions.size(), 2);
|
||||
ASSERT_EQ(key_versions[0].type, kTypeValue);
|
||||
ASSERT_EQ(key_versions[1].type, kTypeMerge);
|
||||
}
|
||||
|
||||
// Plain base value
|
||||
{
|
||||
constexpr char key[] = "key2";
|
||||
|
||||
ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), key, foo));
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar));
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, baz));
|
||||
|
||||
PinnableSlice result;
|
||||
ASSERT_OK(
|
||||
db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result));
|
||||
ASSERT_EQ(result, baz);
|
||||
|
||||
// We expect the second Merge to be converted to a Put because of
|
||||
// max_successive_merges.
|
||||
constexpr size_t max_key_versions = 8;
|
||||
std::vector<KeyVersion> key_versions;
|
||||
ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
|
||||
max_key_versions, &key_versions));
|
||||
ASSERT_EQ(key_versions.size(), 3);
|
||||
ASSERT_EQ(key_versions[0].type, kTypeValue);
|
||||
ASSERT_EQ(key_versions[1].type, kTypeMerge);
|
||||
ASSERT_EQ(key_versions[2].type, kTypeValue);
|
||||
}
|
||||
|
||||
// Wide-column base value
|
||||
{
|
||||
constexpr char key[] = "key3";
|
||||
const WideColumns columns{{kDefaultWideColumnName, foo}, {bar, baz}};
|
||||
|
||||
ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), key,
|
||||
columns));
|
||||
ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, qux));
|
||||
ASSERT_OK(
|
||||
db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, corge));
|
||||
|
||||
PinnableWideColumns result;
|
||||
ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), key,
|
||||
&result));
|
||||
const WideColumns expected{{kDefaultWideColumnName, corge}, {bar, baz}};
|
||||
ASSERT_EQ(result.columns(), expected);
|
||||
|
||||
// We expect the second Merge to be converted to a PutEntity because of
|
||||
// max_successive_merges.
|
||||
constexpr size_t max_key_versions = 8;
|
||||
std::vector<KeyVersion> key_versions;
|
||||
ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
|
||||
max_key_versions, &key_versions));
|
||||
ASSERT_EQ(key_versions.size(), 3);
|
||||
ASSERT_EQ(key_versions[0].type, kTypeWideColumnEntity);
|
||||
ASSERT_EQ(key_versions[1].type, kTypeMerge);
|
||||
ASSERT_EQ(key_versions[2].type, kTypeWideColumnEntity);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
|
|
@ -298,6 +298,20 @@ Status MergeHelper::TimedFullMerge(
|
|||
result_type, op_failure_scope);
|
||||
}
|
||||
|
||||
Status MergeHelper::TimedFullMerge(
|
||||
const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag,
|
||||
const WideColumns& columns, const std::vector<Slice>& operands,
|
||||
Logger* logger, Statistics* statistics, SystemClock* clock,
|
||||
bool update_num_ops_stats, std::string* result, Slice* result_operand,
|
||||
ValueType* result_type, MergeOperator::OpFailureScope* op_failure_scope) {
|
||||
MergeOperator::MergeOperationInputV3::ExistingValue existing_value(columns);
|
||||
|
||||
return TimedFullMergeImpl(merge_operator, key, std::move(existing_value),
|
||||
operands, logger, statistics, clock,
|
||||
update_num_ops_stats, result, result_operand,
|
||||
result_type, op_failure_scope);
|
||||
}
|
||||
|
||||
Status MergeHelper::TimedFullMerge(
|
||||
const MergeOperator* merge_operator, const Slice& key, NoBaseValueTag,
|
||||
const std::vector<Slice>& operands, Logger* logger, Statistics* statistics,
|
||||
|
@ -351,6 +365,21 @@ Status MergeHelper::TimedFullMerge(
|
|||
op_failure_scope);
|
||||
}
|
||||
|
||||
Status MergeHelper::TimedFullMerge(
|
||||
const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag,
|
||||
const WideColumns& columns, const std::vector<Slice>& operands,
|
||||
Logger* logger, Statistics* statistics, SystemClock* clock,
|
||||
bool update_num_ops_stats, std::string* result_value,
|
||||
PinnableWideColumns* result_entity,
|
||||
MergeOperator::OpFailureScope* op_failure_scope) {
|
||||
MergeOperator::MergeOperationInputV3::ExistingValue existing_value(columns);
|
||||
|
||||
return TimedFullMergeImpl(merge_operator, key, std::move(existing_value),
|
||||
operands, logger, statistics, clock,
|
||||
update_num_ops_stats, result_value, result_entity,
|
||||
op_failure_scope);
|
||||
}
|
||||
|
||||
// PRE: iter points to the first merge type entry
|
||||
// POST: iter points to the first entry beyond the merge process (or the end)
|
||||
// keys_, operands_ are updated to reflect the merge result.
|
||||
|
|
|
@ -85,6 +85,13 @@ class MergeHelper {
|
|||
std::string* result, Slice* result_operand, ValueType* result_type,
|
||||
MergeOperator::OpFailureScope* op_failure_scope);
|
||||
|
||||
static Status TimedFullMerge(
|
||||
const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag,
|
||||
const WideColumns& columns, const std::vector<Slice>& operands,
|
||||
Logger* logger, Statistics* statistics, SystemClock* clock,
|
||||
bool update_num_ops_stats, std::string* result, Slice* result_operand,
|
||||
ValueType* result_type, MergeOperator::OpFailureScope* op_failure_scope);
|
||||
|
||||
// Variants that expose the merge result translated to the form requested by
|
||||
// the client. (For example, if the result is a wide-column structure but the
|
||||
// client requested the results in plain-value form, the value of the default
|
||||
|
@ -112,6 +119,16 @@ class MergeHelper {
|
|||
std::string* result_value, PinnableWideColumns* result_entity,
|
||||
MergeOperator::OpFailureScope* op_failure_scope);
|
||||
|
||||
static Status TimedFullMerge(const MergeOperator* merge_operator,
|
||||
const Slice& key, WideBaseValueTag,
|
||||
const WideColumns& columns,
|
||||
const std::vector<Slice>& operands,
|
||||
Logger* logger, Statistics* statistics,
|
||||
SystemClock* clock, bool update_num_ops_stats,
|
||||
std::string* result_value,
|
||||
PinnableWideColumns* result_entity,
|
||||
MergeOperator::OpFailureScope* op_failure_scope);
|
||||
|
||||
// During compaction, merge entries until we hit
|
||||
// - a corrupted key
|
||||
// - a Put/Delete,
|
||||
|
|
|
@ -24,6 +24,11 @@ class WideColumnsHelper {
|
|||
return !columns.empty() && columns.front().name() == kDefaultWideColumnName;
|
||||
}
|
||||
|
||||
static bool HasDefaultColumnOnly(const WideColumns& columns) {
|
||||
return columns.size() == 1 &&
|
||||
columns.front().name() == kDefaultWideColumnName;
|
||||
}
|
||||
|
||||
static const Slice& GetDefaultColumn(const WideColumns& columns) {
|
||||
assert(HasDefaultColumn(columns));
|
||||
return columns.front().value();
|
||||
|
|
|
@ -2483,15 +2483,15 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||
}
|
||||
|
||||
if (perform_merge) {
|
||||
// TODO: support wide-column base values for max_successive_merges
|
||||
|
||||
// 1) Get the existing value
|
||||
std::string get_value;
|
||||
// 1) Get the existing value. Use the wide column APIs to make sure we
|
||||
// don't lose any columns in the process.
|
||||
PinnableWideColumns existing;
|
||||
|
||||
// Pass in the sequence number so that we also include previous merge
|
||||
// operations in the same batch.
|
||||
SnapshotImpl read_from_snapshot;
|
||||
read_from_snapshot.number_ = sequence_;
|
||||
|
||||
// TODO: plumb Env::IOActivity
|
||||
ReadOptions read_options;
|
||||
read_options.snapshot = &read_from_snapshot;
|
||||
|
@ -2500,28 +2500,47 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||
if (cf_handle == nullptr) {
|
||||
cf_handle = db_->DefaultColumnFamily();
|
||||
}
|
||||
Status get_status = db_->Get(read_options, cf_handle, key, &get_value);
|
||||
|
||||
Status get_status =
|
||||
db_->GetEntity(read_options, cf_handle, key, &existing);
|
||||
if (!get_status.ok()) {
|
||||
// Failed to read a key we know exists. Store the delta in memtable.
|
||||
perform_merge = false;
|
||||
} else {
|
||||
Slice get_value_slice = Slice(get_value);
|
||||
|
||||
// 2) Apply this merge
|
||||
auto merge_operator = moptions->merge_operator;
|
||||
assert(merge_operator);
|
||||
|
||||
const auto& columns = existing.columns();
|
||||
|
||||
Status merge_status;
|
||||
std::string new_value;
|
||||
ValueType new_value_type;
|
||||
|
||||
if (WideColumnsHelper::HasDefaultColumnOnly(columns)) {
|
||||
// `op_failure_scope` (an output parameter) is not provided (set to
|
||||
// nullptr) since a failure must be propagated regardless of its value.
|
||||
Status merge_status = MergeHelper::TimedFullMerge(
|
||||
merge_operator, key, MergeHelper::kPlainBaseValue, get_value_slice,
|
||||
// nullptr) since a failure must be propagated regardless of its
|
||||
// value.
|
||||
merge_status = MergeHelper::TimedFullMerge(
|
||||
merge_operator, key, MergeHelper::kPlainBaseValue,
|
||||
WideColumnsHelper::GetDefaultColumn(columns), {value},
|
||||
moptions->info_log, moptions->statistics,
|
||||
SystemClock::Default().get(),
|
||||
/* update_num_ops_stats */ false, &new_value,
|
||||
/* result_operand */ nullptr, &new_value_type,
|
||||
/* op_failure_scope */ nullptr);
|
||||
} else {
|
||||
// `op_failure_scope` (an output parameter) is not provided (set to
|
||||
// nullptr) since a failure must be propagated regardless of its
|
||||
// value.
|
||||
merge_status = MergeHelper::TimedFullMerge(
|
||||
merge_operator, key, MergeHelper::kWideBaseValue, columns,
|
||||
{value}, moptions->info_log, moptions->statistics,
|
||||
SystemClock::Default().get(),
|
||||
/* update_num_ops_stats */ false, &new_value,
|
||||
/* result_operand */ nullptr, &new_value_type,
|
||||
/* op_failure_scope */ nullptr);
|
||||
}
|
||||
|
||||
if (!merge_status.ok()) {
|
||||
// Failed to merge!
|
||||
|
@ -2530,12 +2549,13 @@ class MemTableInserter : public WriteBatch::Handler {
|
|||
} else {
|
||||
// 3) Add value to memtable
|
||||
assert(!concurrent_memtable_writes_);
|
||||
assert(new_value_type == kTypeValue ||
|
||||
new_value_type == kTypeWideColumnEntity);
|
||||
|
||||
if (kv_prot_info != nullptr) {
|
||||
auto merged_kv_prot_info =
|
||||
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
||||
merged_kv_prot_info.UpdateV(value, new_value);
|
||||
assert(new_value_type == kTypeValue ||
|
||||
new_value_type == kTypeWideColumnEntity);
|
||||
merged_kv_prot_info.UpdateO(kTypeMerge, new_value_type);
|
||||
ret_status = mem->Add(sequence_, new_value_type, key, new_value,
|
||||
&merged_kv_prot_info);
|
||||
|
|
|
@ -1100,8 +1100,7 @@ std::string LDBCommand::PrintKeyValueOrWideColumns(
|
|||
const Slice& key, const Slice& value, const WideColumns& wide_columns,
|
||||
bool is_key_hex, bool is_value_hex) {
|
||||
if (wide_columns.empty() ||
|
||||
(wide_columns.size() == 1 &&
|
||||
WideColumnsHelper::HasDefaultColumn(wide_columns))) {
|
||||
WideColumnsHelper::HasDefaultColumnOnly(wide_columns)) {
|
||||
return PrintKeyValue(key.ToString(), value.ToString(), is_key_hex,
|
||||
is_value_hex);
|
||||
}
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Fixed the handling of wide-column base values in the `max_successive_merges` logic.
|
Loading…
Reference in New Issue