Fix a bug in looking up duplicate keys with MultiGet (#6953)

Summary: When MultiGet is called with duplicate keys, and the key matches the largest key in an SST file and the value type is merge, only the first instance of the duplicate key is returned with correct results. This is due to the incorrect assumption that if a key in a batch is equal to the largest key in the file, the next key cannot be present in that file. Tests: Add a new unit test Pull Request resolved: https://github.com/facebook/rocksdb/pull/6953 Reviewed By: cheng-chang Differential Revision: D21935898 Pulled By: anand1976 fbshipit-source-id: a2cc327a15150e23fd997546ca64d1c33021cb4c
2024-11-25 14:31:35 +00:00 · 2020-06-08 16:08:31 -07:00 · 2020-06-08 16:08:31 -07:00 · 1fb3593f25
parent f5e649453c
commit 1fb3593f25
3 changed files with 86 additions and 11 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -14,6 +14,7 @@
 * Fix potential file descriptor leakage in PosixEnv's IsDirectory() and NewRandomAccessFile().
 * Fix false negative from the VerifyChecksum() API when there is a checksum mismatch in an index partition block in a BlockBasedTable format table file (index_type is kTwoLevelIndexSearch).
 * Fix sst_dump to return non-zero exit code if the specified file is not a recognized SST file or fails requested checks.
+* Fix incorrect results from batched MultiGet for duplicate keys, when the duplicate key matches the largest key of an SST file and the value type for the key in the file is a merge value.

 ### Public API Change
 * Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request.
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@ -9,6 +9,7 @@

 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
+#include "rocksdb/merge_operator.h"
 #include "rocksdb/perf_context.h"
 #include "rocksdb/utilities/debug.h"
 #include "table/block_based/block_based_table_reader.h"
@ -17,6 +18,8 @@
 #if !defined(ROCKSDB_LITE)
 #include "test_util/sync_point.h"
 #endif
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"

 namespace ROCKSDB_NAMESPACE {

@ -1340,6 +1343,57 @@ TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) {
  } while (ChangeOptions());
 }

+TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) {
+  Options opts = CurrentOptions();
+  opts.merge_operator = MergeOperators::CreateStringAppendOperator();
+  CreateAndReopenWithCF({"pikachu"}, opts);
+  SetPerfLevel(kEnableCount);
+  // To expand the power of this test, generate > 1 table file and
+  // mix with memtable
+  ASSERT_OK(Merge(1, "k1", "v1"));
+  ASSERT_OK(Merge(1, "k2", "v2"));
+  Flush(1);
+  MoveFilesToLevel(2, 1);
+  ASSERT_OK(Merge(1, "k3", "v3"));
+  ASSERT_OK(Merge(1, "k4", "v4"));
+  Flush(1);
+  MoveFilesToLevel(2, 1);
+  ASSERT_OK(Merge(1, "k4", "v4_2"));
+  ASSERT_OK(Merge(1, "k6", "v6"));
+  Flush(1);
+  MoveFilesToLevel(2, 1);
+  ASSERT_OK(Merge(1, "k7", "v7"));
+  ASSERT_OK(Merge(1, "k8", "v8"));
+  Flush(1);
+  MoveFilesToLevel(2, 1);
+
+  get_perf_context()->Reset();
+
+  std::vector<Slice> keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"});
+  std::vector<PinnableSlice> values(keys.size());
+  std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+  std::vector<Status> s(keys.size());
+
+  db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+                values.data(), s.data(), false);
+
+  ASSERT_EQ(values.size(), keys.size());
+  ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8");
+  ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8");
+  ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8");
+  ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2");
+  ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2");
+  ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+  ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3");
+  ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes);
+
+  for (Status& status : s) {
+    ASSERT_OK(status);
+  }
+
+  SetPerfLevel(kDisable);
+}
+
 TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) {
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
--- a/db/version_set.cc
+++ b/db/version_set.cc
@ -365,6 +365,7 @@ class FilePickerMultiGet {
        range_(range),
        batch_iter_(range->begin()),
        batch_iter_prev_(range->begin()),
+        upper_key_(range->begin()),
        maybe_repeat_key_(false),
        current_level_range_(*range, range->begin(), range->end()),
        current_file_range_(*range, range->begin(), range->end()),
@ -481,9 +482,19 @@ class FilePickerMultiGet {
      }
      if (cmp_largest == 0) {
        // cmp_largest is 0, which means the next key will not be in this
-        // file, so stop looking further. Also don't increment megt_iter_
-        // as we may have to look for this key in the next file if we don't
-        // find it in this one
+        // file, so stop looking further. However, its possible there are
+        // duplicates in the batch, so find the upper bound for the batch
+        // in this file (upper_key_) by skipping past the duplicates. We
+        // leave batch_iter_ as is since we may have to pick up from there
+        // for the next file, if this file has a merge value rather than
+        // final value
+        upper_key_ = batch_iter_;
+        ++upper_key_;
+        while (upper_key_ != current_level_range_.end() &&
+               user_comparator_->Compare(batch_iter_->ukey, upper_key_->ukey) ==
+                   0) {
+          ++upper_key_;
+        }
        break;
      } else {
        if (curr_level_ == 0) {
@ -503,6 +514,12 @@ class FilePickerMultiGet {
    *fd = f;
    *file_index = curr_file_index;
    *is_last_key_in_file = cmp_largest == 0;
+    if (!*is_last_key_in_file) {
+      // If the largest key in the batch overlapping the file is not the
+      // largest key in the file, upper_ley_ would not have been updated so
+      // update it here
+      upper_key_ = batch_iter_;
+    }
    return file_hit;
  }

@ -524,7 +541,7 @@ class FilePickerMultiGet {
          // file regardless for all keys not found yet
          if (current_level_range_.CheckKeyDone(batch_iter_) ||
              curr_level_ == 0) {
-            ++batch_iter_;
+            batch_iter_ = upper_key_;
          }
        }
        // batch_iter_prev_ will become the start key for the next file
@ -544,18 +561,20 @@ class FilePickerMultiGet {
                                      &is_last_key_in_file)) {
        search_ended_ = !PrepareNextLevel();
      } else {
-        MultiGetRange::Iterator upper_key = batch_iter_;
        if (is_last_key_in_file) {
          // Since cmp_largest is 0, batch_iter_ still points to the last key
          // that falls in this file, instead of the next one. Increment
-          // upper_key so we can set the range properly for SST MultiGet
-          ++upper_key;
-          ++(fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level);
+          // the file index for all keys between batch_iter_ and upper_key_
+          auto tmp_iter = batch_iter_;
+          while (tmp_iter != upper_key_) {
+            ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
+            ++tmp_iter;
+          }
          maybe_repeat_key_ = true;
        }
        // Set the range for this file
        current_file_range_ =
-            MultiGetRange(next_file_range, batch_iter_prev_, upper_key);
+            MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
        returned_file_level_ = curr_level_;
        hit_file_level_ = curr_level_;
        is_hit_file_last_in_level_ =
@ -607,6 +626,7 @@ class FilePickerMultiGet {
  // key found in the previous SST file, in order to serve as the start of
  // the batch key range for the next SST file
  MultiGetRange::Iterator batch_iter_prev_;
+  MultiGetRange::Iterator upper_key_;
  bool maybe_repeat_key_;
  MultiGetRange current_level_range_;
  MultiGetRange current_file_range_;
@ -626,7 +646,7 @@ class FilePickerMultiGet {
      if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
          curr_file_level_->num_files) {
        batch_iter_prev_ = current_level_range_.begin();
-        batch_iter_ = current_level_range_.begin();
+        upper_key_ = batch_iter_ = current_level_range_.begin();
        return true;
      }
    }
@ -721,7 +741,7 @@ class FilePickerMultiGet {
      }
      if (level_contains_keys) {
        batch_iter_prev_ = current_level_range_.begin();
-        batch_iter_ = current_level_range_.begin();
+        upper_key_ = batch_iter_ = current_level_range_.begin();
        return true;
      }
      curr_level_++;