rocksdb/db/blob/db_blob_index_test.cc

598 lines
20 KiB
C++
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <functional>
#include <string>
#include <utility>
#include <vector>
#include "db/arena_wrapped_db_iter.h"
#include "db/blob/blob_index.h"
#include "db/column_family.h"
#include "db/db_iter.h"
#include "db/db_test_util.h"
#include "db/dbformat.h"
#include "db/write_batch_internal.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "util/string_util.h"
#include "utilities/merge_operators.h"
namespace ROCKSDB_NAMESPACE {
// kTypeBlobIndex is a value type used by BlobDB only. The base rocksdb
// should accept the value type on write, and report not supported value
// for reads, unless caller request for it explicitly. The base rocksdb
// doesn't understand format of actual blob index (the value).
class DBBlobIndexTest : public DBTestBase {
public:
enum Tier {
kMemtable = 0,
kImmutableMemtables = 1,
kL0SstFile = 2,
kLnSstFile = 3,
};
const std::vector<Tier> kAllTiers = {Tier::kMemtable,
Tier::kImmutableMemtables,
Tier::kL0SstFile, Tier::kLnSstFile};
DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {}
ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
Access DBImpl* and CFD* by CFHImpl* in Iterators (#12395) Summary: In the current implementation of iterators, `DBImpl*` and `ColumnFamilyData*` are held in `DBIter` and `ArenaWrappedDBIter` for two purposes: tracing and Refresh() API. With the introduction of a new iterator called MultiCfIterator in PR https://github.com/facebook/rocksdb/issues/12153 , which is a cross-column-family iterator that maintains multiple DBIters as child iterators from a consistent database state, we need to make some changes to the existing implementation. The new iterator will still be exposed through the generic Iterator interface with an additional capability to return AttributeGroups (via `attribute_groups()`) which is a list of wide columns grouped by column family. For more information about AttributeGroup, please refer to previous PRs: https://github.com/facebook/rocksdb/issues/11925 #11943, and https://github.com/facebook/rocksdb/issues/11977. To be able to return AttributeGroup in the default single CF iterator created, access to `ColumnFamilyHandle*` within `DBIter` is necessary. However, this is not currently available in `DBIter`. Since `DBImpl*` and `ColumnFamilyData*` can be easily accessed via `ColumnFamilyHandleImpl*`, we have decided to replace the pointers to `ColumnFamilyData` and `DBImpl` in `DBIter` with a pointer to `ColumnFamilyHandleImpl`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12395 Test Plan: # Summary In the current implementation of iterators, `DBImpl*` and `ColumnFamilyData*` are held in `DBIter` and `ArenaWrappedDBIter` for two purposes: tracing and Refresh() API. With the introduction of a new iterator called MultiCfIterator in PR #12153 , which is a cross-column-family iterator that maintains multiple DBIters as child iterators from a consistent database state, we need to make some changes to the existing implementation. The new iterator will still be exposed through the generic Iterator interface with an additional capability to return AttributeGroups (via `attribute_groups()`) which is a list of wide columns grouped by column family. For more information about AttributeGroup, please refer to previous PRs: #11925 #11943, and #11977. To be able to return AttributeGroup in the default single CF iterator created, access to `ColumnFamilyHandle*` within `DBIter` is necessary. However, this is not currently available in `DBIter`. Since `DBImpl*` and `ColumnFamilyData*` can be easily accessed via `ColumnFamilyHandleImpl*`, we have decided to replace the pointers to `ColumnFamilyData` and `DBImpl` in `DBIter` with a pointer to `ColumnFamilyHandleImpl`. # Test Plan There should be no behavior changes. Existing tests and CI for the correctness tests. **Test for Perf Regression** Build ``` $> make -j64 release ``` Setup ``` $> TEST_TMPDIR=/dev/shm/db_bench ./db_bench -benchmarks="filluniquerandom" -key_size=32 -value_size=512 -num=1000000 -compression_type=none ``` Run ``` TEST_TMPDIR=/dev/shm/db_bench ./db_bench -use_existing_db=1 -benchmarks="newiterator,seekrandom" -cache_size=10485760000 ``` Before the change ``` DB path: [/dev/shm/db_bench/dbbench] newiterator : 0.552 micros/op 1810157 ops/sec 0.552 seconds 1000000 operations; DB path: [/dev/shm/db_bench/dbbench] seekrandom : 4.502 micros/op 222143 ops/sec 4.502 seconds 1000000 operations; (0 of 1000000 found) ``` After the change ``` DB path: [/dev/shm/db_bench/dbbench] newiterator : 0.520 micros/op 1924401 ops/sec 0.520 seconds 1000000 operations; DB path: [/dev/shm/db_bench/dbbench] seekrandom : 4.532 micros/op 220657 ops/sec 4.532 seconds 1000000 operations; (0 of 1000000 found) ``` Reviewed By: pdillinger Differential Revision: D54332713 Pulled By: jaykorean fbshipit-source-id: b28d897ad519e58b1ca82eb068a6319544a4fae5
2024-03-01 18:28:20 +00:00
ColumnFamilyHandleImpl* cfh_impl() {
return static_cast_with_check<ColumnFamilyHandleImpl>(cfh());
}
Access DBImpl* and CFD* by CFHImpl* in Iterators (#12395) Summary: In the current implementation of iterators, `DBImpl*` and `ColumnFamilyData*` are held in `DBIter` and `ArenaWrappedDBIter` for two purposes: tracing and Refresh() API. With the introduction of a new iterator called MultiCfIterator in PR https://github.com/facebook/rocksdb/issues/12153 , which is a cross-column-family iterator that maintains multiple DBIters as child iterators from a consistent database state, we need to make some changes to the existing implementation. The new iterator will still be exposed through the generic Iterator interface with an additional capability to return AttributeGroups (via `attribute_groups()`) which is a list of wide columns grouped by column family. For more information about AttributeGroup, please refer to previous PRs: https://github.com/facebook/rocksdb/issues/11925 #11943, and https://github.com/facebook/rocksdb/issues/11977. To be able to return AttributeGroup in the default single CF iterator created, access to `ColumnFamilyHandle*` within `DBIter` is necessary. However, this is not currently available in `DBIter`. Since `DBImpl*` and `ColumnFamilyData*` can be easily accessed via `ColumnFamilyHandleImpl*`, we have decided to replace the pointers to `ColumnFamilyData` and `DBImpl` in `DBIter` with a pointer to `ColumnFamilyHandleImpl`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12395 Test Plan: # Summary In the current implementation of iterators, `DBImpl*` and `ColumnFamilyData*` are held in `DBIter` and `ArenaWrappedDBIter` for two purposes: tracing and Refresh() API. With the introduction of a new iterator called MultiCfIterator in PR #12153 , which is a cross-column-family iterator that maintains multiple DBIters as child iterators from a consistent database state, we need to make some changes to the existing implementation. The new iterator will still be exposed through the generic Iterator interface with an additional capability to return AttributeGroups (via `attribute_groups()`) which is a list of wide columns grouped by column family. For more information about AttributeGroup, please refer to previous PRs: #11925 #11943, and #11977. To be able to return AttributeGroup in the default single CF iterator created, access to `ColumnFamilyHandle*` within `DBIter` is necessary. However, this is not currently available in `DBIter`. Since `DBImpl*` and `ColumnFamilyData*` can be easily accessed via `ColumnFamilyHandleImpl*`, we have decided to replace the pointers to `ColumnFamilyData` and `DBImpl` in `DBIter` with a pointer to `ColumnFamilyHandleImpl`. # Test Plan There should be no behavior changes. Existing tests and CI for the correctness tests. **Test for Perf Regression** Build ``` $> make -j64 release ``` Setup ``` $> TEST_TMPDIR=/dev/shm/db_bench ./db_bench -benchmarks="filluniquerandom" -key_size=32 -value_size=512 -num=1000000 -compression_type=none ``` Run ``` TEST_TMPDIR=/dev/shm/db_bench ./db_bench -use_existing_db=1 -benchmarks="newiterator,seekrandom" -cache_size=10485760000 ``` Before the change ``` DB path: [/dev/shm/db_bench/dbbench] newiterator : 0.552 micros/op 1810157 ops/sec 0.552 seconds 1000000 operations; DB path: [/dev/shm/db_bench/dbbench] seekrandom : 4.502 micros/op 222143 ops/sec 4.502 seconds 1000000 operations; (0 of 1000000 found) ``` After the change ``` DB path: [/dev/shm/db_bench/dbbench] newiterator : 0.520 micros/op 1924401 ops/sec 0.520 seconds 1000000 operations; DB path: [/dev/shm/db_bench/dbbench] seekrandom : 4.532 micros/op 220657 ops/sec 4.532 seconds 1000000 operations; (0 of 1000000 found) ``` Reviewed By: pdillinger Differential Revision: D54332713 Pulled By: jaykorean fbshipit-source-id: b28d897ad519e58b1ca82eb068a6319544a4fae5
2024-03-01 18:28:20 +00:00
ColumnFamilyData* cfd() { return cfh_impl()->cfd(); }
Status PutBlobIndex(WriteBatch* batch, const Slice& key,
const Slice& blob_index) {
return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
blob_index);
}
Status Write(WriteBatch* batch) {
return dbfull()->Write(WriteOptions(), batch);
}
std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
const Snapshot* snapshot = nullptr) {
ReadOptions read_options;
read_options.snapshot = snapshot;
PinnableSlice value;
New API to get all merge operands for a Key (#5604) Summary: This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases: 1. Update subset of columns and read subset of columns - Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU. 2. Updating very few attributes in a value which is a JSON-like document - Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge. ---------------------------------------------------------------------------------------------------- API : Status GetMergeOperands( const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* merge_operands, GetMergeOperandsOptions* get_merge_operands_options, int* number_of_operands) Example usage : int size = 100; int number_of_operands = 0; std::vector<PinnableSlice> values(size); GetMergeOperandsOptions merge_operands_info; db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands); Description : Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion. merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604 Test Plan: Added unit test and perf test in db_bench that can be run using the command: ./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist Differential Revision: D16657366 Pulled By: vjnadimpalli fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
2019-08-06 21:22:34 +00:00
DBImpl::GetImplOptions get_impl_options;
get_impl_options.column_family = cfh();
get_impl_options.value = &value;
get_impl_options.is_blob_index = is_blob_index;
auto s = dbfull()->GetImpl(read_options, key, get_impl_options);
if (s.IsNotFound()) {
return "NOT_FOUND";
}
if (s.IsCorruption()) {
return "CORRUPTION";
}
if (s.IsNotSupported()) {
return "NOT_SUPPORTED";
}
if (!s.ok()) {
return s.ToString();
}
return value.ToString();
}
std::string GetBlobIndex(const Slice& key,
const Snapshot* snapshot = nullptr) {
bool is_blob_index = false;
std::string value = GetImpl(key, &is_blob_index, snapshot);
if (!is_blob_index) {
return "NOT_BLOB";
}
return value;
}
ArenaWrappedDBIter* GetBlobIterator() {
Track full_history_ts_low per SuperVersion (#11784) Summary: As discussed in https://github.com/facebook/rocksdb/issues/11730 , this PR tracks the effective `full_history_ts_low` per SuperVersion and update existing sanity checks for `ReadOptions.timestamp >= full_history_ts_low` to use this per SuperVersion `full_history_ts_low` instead. This also means the check is moved to happen after acquiring SuperVersion. There are two motivations for this: 1) Each time `full_history_ts_low` really come into effect to collapse history, a new SuperVersion is always installed, because it would involve either a Flush or Compaction, both of which change the LSM tree shape. We can take advantage of this to ensure that as long as this sanity check is passed, even if `full_history_ts_low` can be concurrently increased and collapse some history above the requested `ReadOptions.timestamp`, a read request won’t have visibility to that part of history through this SuperVersion that it already acquired. 2) the existing sanity check uses `ColumnFamilyData::GetFullHistoryTsLow` without locking the db mutex, which is the mutex all `IncreaseFullHistoryTsLow` operation is using when mutating this field. So there is a race condition. This also solve the race condition on the read path. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11784 Test Plan: `make all check` // Checks success scenario really provide the read consistency attribute as mentioned above. `./db_with_timestamp_basic_test --gtest_filter=*FullHistoryTsLowSanityCheckPassReadIsConsistent*` // Checks failure scenario cleans up SuperVersion properly. `./db_with_timestamp_basic_test --gtest_filter=*FullHistoryTsLowSanityCheckFail*` `./db_secondary_test --gtest_filter=*FullHistoryTsLowSanityCheckFail*` `./db_readonly_with_timestamp_test --gtest_filter=*FullHistoryTsLowSanitchCheckFail*` Reviewed By: ltamasi Differential Revision: D48894795 Pulled By: jowlyzhang fbshipit-source-id: 1f801fe8e1bc8e63ca76c03cbdbd0974e5ff5bf6
2023-09-13 23:34:18 +00:00
DBImpl* db_impl = dbfull();
return db_impl->NewIteratorImpl(
Access DBImpl* and CFD* by CFHImpl* in Iterators (#12395) Summary: In the current implementation of iterators, `DBImpl*` and `ColumnFamilyData*` are held in `DBIter` and `ArenaWrappedDBIter` for two purposes: tracing and Refresh() API. With the introduction of a new iterator called MultiCfIterator in PR https://github.com/facebook/rocksdb/issues/12153 , which is a cross-column-family iterator that maintains multiple DBIters as child iterators from a consistent database state, we need to make some changes to the existing implementation. The new iterator will still be exposed through the generic Iterator interface with an additional capability to return AttributeGroups (via `attribute_groups()`) which is a list of wide columns grouped by column family. For more information about AttributeGroup, please refer to previous PRs: https://github.com/facebook/rocksdb/issues/11925 #11943, and https://github.com/facebook/rocksdb/issues/11977. To be able to return AttributeGroup in the default single CF iterator created, access to `ColumnFamilyHandle*` within `DBIter` is necessary. However, this is not currently available in `DBIter`. Since `DBImpl*` and `ColumnFamilyData*` can be easily accessed via `ColumnFamilyHandleImpl*`, we have decided to replace the pointers to `ColumnFamilyData` and `DBImpl` in `DBIter` with a pointer to `ColumnFamilyHandleImpl`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12395 Test Plan: # Summary In the current implementation of iterators, `DBImpl*` and `ColumnFamilyData*` are held in `DBIter` and `ArenaWrappedDBIter` for two purposes: tracing and Refresh() API. With the introduction of a new iterator called MultiCfIterator in PR #12153 , which is a cross-column-family iterator that maintains multiple DBIters as child iterators from a consistent database state, we need to make some changes to the existing implementation. The new iterator will still be exposed through the generic Iterator interface with an additional capability to return AttributeGroups (via `attribute_groups()`) which is a list of wide columns grouped by column family. For more information about AttributeGroup, please refer to previous PRs: #11925 #11943, and #11977. To be able to return AttributeGroup in the default single CF iterator created, access to `ColumnFamilyHandle*` within `DBIter` is necessary. However, this is not currently available in `DBIter`. Since `DBImpl*` and `ColumnFamilyData*` can be easily accessed via `ColumnFamilyHandleImpl*`, we have decided to replace the pointers to `ColumnFamilyData` and `DBImpl` in `DBIter` with a pointer to `ColumnFamilyHandleImpl`. # Test Plan There should be no behavior changes. Existing tests and CI for the correctness tests. **Test for Perf Regression** Build ``` $> make -j64 release ``` Setup ``` $> TEST_TMPDIR=/dev/shm/db_bench ./db_bench -benchmarks="filluniquerandom" -key_size=32 -value_size=512 -num=1000000 -compression_type=none ``` Run ``` TEST_TMPDIR=/dev/shm/db_bench ./db_bench -use_existing_db=1 -benchmarks="newiterator,seekrandom" -cache_size=10485760000 ``` Before the change ``` DB path: [/dev/shm/db_bench/dbbench] newiterator : 0.552 micros/op 1810157 ops/sec 0.552 seconds 1000000 operations; DB path: [/dev/shm/db_bench/dbbench] seekrandom : 4.502 micros/op 222143 ops/sec 4.502 seconds 1000000 operations; (0 of 1000000 found) ``` After the change ``` DB path: [/dev/shm/db_bench/dbbench] newiterator : 0.520 micros/op 1924401 ops/sec 0.520 seconds 1000000 operations; DB path: [/dev/shm/db_bench/dbbench] seekrandom : 4.532 micros/op 220657 ops/sec 4.532 seconds 1000000 operations; (0 of 1000000 found) ``` Reviewed By: pdillinger Differential Revision: D54332713 Pulled By: jaykorean fbshipit-source-id: b28d897ad519e58b1ca82eb068a6319544a4fae5
2024-03-01 18:28:20 +00:00
ReadOptions(), cfh_impl(), cfd()->GetReferencedSuperVersion(db_impl),
Track full_history_ts_low per SuperVersion (#11784) Summary: As discussed in https://github.com/facebook/rocksdb/issues/11730 , this PR tracks the effective `full_history_ts_low` per SuperVersion and update existing sanity checks for `ReadOptions.timestamp >= full_history_ts_low` to use this per SuperVersion `full_history_ts_low` instead. This also means the check is moved to happen after acquiring SuperVersion. There are two motivations for this: 1) Each time `full_history_ts_low` really come into effect to collapse history, a new SuperVersion is always installed, because it would involve either a Flush or Compaction, both of which change the LSM tree shape. We can take advantage of this to ensure that as long as this sanity check is passed, even if `full_history_ts_low` can be concurrently increased and collapse some history above the requested `ReadOptions.timestamp`, a read request won’t have visibility to that part of history through this SuperVersion that it already acquired. 2) the existing sanity check uses `ColumnFamilyData::GetFullHistoryTsLow` without locking the db mutex, which is the mutex all `IncreaseFullHistoryTsLow` operation is using when mutating this field. So there is a race condition. This also solve the race condition on the read path. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11784 Test Plan: `make all check` // Checks success scenario really provide the read consistency attribute as mentioned above. `./db_with_timestamp_basic_test --gtest_filter=*FullHistoryTsLowSanityCheckPassReadIsConsistent*` // Checks failure scenario cleans up SuperVersion properly. `./db_with_timestamp_basic_test --gtest_filter=*FullHistoryTsLowSanityCheckFail*` `./db_secondary_test --gtest_filter=*FullHistoryTsLowSanityCheckFail*` `./db_readonly_with_timestamp_test --gtest_filter=*FullHistoryTsLowSanitchCheckFail*` Reviewed By: ltamasi Differential Revision: D48894795 Pulled By: jowlyzhang fbshipit-source-id: 1f801fe8e1bc8e63ca76c03cbdbd0974e5ff5bf6
2023-09-13 23:34:18 +00:00
db_impl->GetLatestSequenceNumber(), nullptr /*read_callback*/,
true /*expose_blob_index*/);
}
Options GetTestOptions() {
Options options;
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566) Summary: This PR does a few things: 1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation. 2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed: - The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated - The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory). 3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10). I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged. Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently. Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :) Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566 Reviewed By: zhichao-cao Differential Revision: D24408980 Pulled By: jay-zhuang fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
options.env = CurrentOptions().env;
options.create_if_missing = true;
options.num_levels = 2;
options.disable_auto_compactions = true;
// Disable auto flushes.
options.max_write_buffer_number = 10;
options.min_write_buffer_number_to_merge = 10;
options.merge_operator = MergeOperators::CreateStringAppendOperator();
return options;
}
void MoveDataTo(Tier tier) {
switch (tier) {
case Tier::kMemtable:
break;
case Tier::kImmutableMemtables:
ASSERT_OK(dbfull()->TEST_SwitchMemtable());
break;
case Tier::kL0SstFile:
ASSERT_OK(Flush());
break;
case Tier::kLnSstFile:
ASSERT_OK(Flush());
ASSERT_OK(Put("a", "dummy"));
ASSERT_OK(Put("z", "dummy"));
ASSERT_OK(Flush());
ASSERT_OK(
dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
ASSERT_EQ("0,1", FilesPerLevel());
break;
}
}
};
// Note: the following test case pertains to the StackableDB-based BlobDB
// implementation. We should be able to write kTypeBlobIndex to memtables and
// SST files.
TEST_F(DBBlobIndexTest, Write) {
for (auto tier : kAllTiers) {
DestroyAndReopen(GetTestOptions());
std::vector<std::pair<std::string, std::string>> key_values;
constexpr size_t num_key_values = 5;
key_values.reserve(num_key_values);
for (size_t i = 1; i <= num_key_values; ++i) {
std::string key = "key" + std::to_string(i);
std::string blob_index;
BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
"blob" + std::to_string(i));
key_values.emplace_back(std::move(key), std::move(blob_index));
}
for (const auto& key_value : key_values) {
WriteBatch batch;
ASSERT_OK(PutBlobIndex(&batch, key_value.first, key_value.second));
ASSERT_OK(Write(&batch));
}
MoveDataTo(tier);
for (const auto& key_value : key_values) {
ASSERT_EQ(GetBlobIndex(key_value.first), key_value.second);
}
}
}
// Note: the following test case pertains to the StackableDB-based BlobDB
// implementation. Get should be able to return blob index if is_blob_index is
// provided, otherwise it should return Status::NotSupported (when reading from
// memtable) or Status::Corruption (when reading from SST). Reading from SST
// returns Corruption because we can't differentiate between the application
// accidentally opening the base DB of a stacked BlobDB and actual corruption
// when using the integrated BlobDB.
TEST_F(DBBlobIndexTest, Get) {
std::string blob_index;
BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
for (auto tier : kAllTiers) {
DestroyAndReopen(GetTestOptions());
WriteBatch batch;
ASSERT_OK(batch.Put("key", "value"));
ASSERT_OK(PutBlobIndex(&batch, "blob_key", blob_index));
ASSERT_OK(Write(&batch));
MoveDataTo(tier);
// Verify normal value
bool is_blob_index = false;
PinnableSlice value;
ASSERT_EQ("value", Get("key"));
ASSERT_EQ("value", GetImpl("key"));
ASSERT_EQ("value", GetImpl("key", &is_blob_index));
ASSERT_FALSE(is_blob_index);
// Verify blob index
if (tier <= kImmutableMemtables) {
ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
} else {
ASSERT_TRUE(Get("blob_key", &value).IsCorruption());
ASSERT_EQ("CORRUPTION", GetImpl("blob_key"));
}
ASSERT_EQ(blob_index, GetImpl("blob_key", &is_blob_index));
ASSERT_TRUE(is_blob_index);
}
}
// Note: the following test case pertains to the StackableDB-based BlobDB
// implementation. Get should NOT return Status::NotSupported/Status::Corruption
// if blob index is updated with a normal value. See the test case above for
// more details.
TEST_F(DBBlobIndexTest, Updated) {
std::string blob_index;
BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
for (auto tier : kAllTiers) {
DestroyAndReopen(GetTestOptions());
WriteBatch batch;
for (int i = 0; i < 10; i++) {
ASSERT_OK(PutBlobIndex(&batch, "key" + std::to_string(i), blob_index));
}
ASSERT_OK(Write(&batch));
// Avoid blob values from being purged.
const Snapshot* snapshot = dbfull()->GetSnapshot();
ASSERT_OK(Put("key1", "new_value"));
ASSERT_OK(Merge("key2", "a"));
ASSERT_OK(Merge("key2", "b"));
ASSERT_OK(Merge("key2", "c"));
ASSERT_OK(Delete("key3"));
ASSERT_OK(SingleDelete("key4"));
ASSERT_OK(Delete("key5"));
ASSERT_OK(Merge("key5", "a"));
ASSERT_OK(Merge("key5", "b"));
ASSERT_OK(Merge("key5", "c"));
ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
MoveDataTo(tier);
for (int i = 0; i < 10; i++) {
ASSERT_EQ(blob_index, GetBlobIndex("key" + std::to_string(i), snapshot));
}
ASSERT_EQ("new_value", Get("key1"));
if (tier <= kImmutableMemtables) {
ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
} else {
ASSERT_EQ("CORRUPTION", GetImpl("key2"));
}
ASSERT_EQ("NOT_FOUND", Get("key3"));
ASSERT_EQ("NOT_FOUND", Get("key4"));
ASSERT_EQ("a,b,c", GetImpl("key5"));
for (int i = 6; i < 9; i++) {
ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
}
ASSERT_EQ(blob_index, GetBlobIndex("key9"));
dbfull()->ReleaseSnapshot(snapshot);
}
}
// Note: the following test case pertains to the StackableDB-based BlobDB
// implementation. When a blob iterator is used, it should set the
// expose_blob_index flag for the underlying DBIter, and retrieve/return the
// corresponding blob value. If a regular DBIter is created (i.e.
// expose_blob_index is not set), it should return Status::Corruption.
TEST_F(DBBlobIndexTest, Iterate) {
const std::vector<std::vector<ValueType>> data = {
/*00*/ {kTypeValue},
/*01*/ {kTypeBlobIndex},
/*02*/ {kTypeValue},
/*03*/ {kTypeBlobIndex, kTypeValue},
/*04*/ {kTypeValue},
/*05*/ {kTypeValue, kTypeBlobIndex},
/*06*/ {kTypeValue},
/*07*/ {kTypeDeletion, kTypeBlobIndex},
/*08*/ {kTypeValue},
/*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
/*10*/ {kTypeValue},
/*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
/*12*/ {kTypeValue},
/*13*/
{kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
/*14*/ {kTypeValue},
/*15*/ {kTypeBlobIndex},
/*16*/ {kTypeValue},
};
auto get_key = [](int index) {
char buf[20];
snprintf(buf, sizeof(buf), "%02d", index);
return "key" + std::string(buf);
};
auto get_value = [&](int index, int version) {
return get_key(index) + "_value" + std::to_string(version);
};
auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
const Slice& expected_value) {
ASSERT_EQ(expected_status, iterator->status().code());
if (expected_status == Status::kOk) {
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ(expected_value, iterator->value());
} else {
ASSERT_FALSE(iterator->Valid());
}
};
auto create_normal_iterator = [&]() -> Iterator* {
return dbfull()->NewIterator(ReadOptions());
};
auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
auto check_is_blob = [&](bool is_blob) {
return [is_blob](Iterator* iterator) {
Prefer static_cast in place of most reinterpret_cast (#12308) Summary: The following are risks associated with pointer-to-pointer reinterpret_cast: * Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do. * Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally. I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement: * Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have `struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic. * Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance. With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain. A couple of related interventions included here: * Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle. * Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse). Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work. I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308 Test Plan: existing tests, CI Reviewed By: ltamasi Differential Revision: D53204947 Pulled By: pdillinger fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
ASSERT_EQ(is_blob, static_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
};
};
auto verify = [&](int index, Status::Code expected_status,
const Slice& forward_value, const Slice& backward_value,
std::function<Iterator*()> create_iterator,
std::function<void(Iterator*)> extra_check = nullptr) {
// Seek
auto* iterator = create_iterator();
ASSERT_OK(iterator->status());
ASSERT_OK(iterator->Refresh());
iterator->Seek(get_key(index));
check_iterator(iterator, expected_status, forward_value);
if (extra_check) {
extra_check(iterator);
}
delete iterator;
// Next
iterator = create_iterator();
ASSERT_OK(iterator->Refresh());
iterator->Seek(get_key(index - 1));
ASSERT_TRUE(iterator->Valid());
ASSERT_OK(iterator->status());
iterator->Next();
check_iterator(iterator, expected_status, forward_value);
if (extra_check) {
extra_check(iterator);
}
delete iterator;
// SeekForPrev
iterator = create_iterator();
ASSERT_OK(iterator->status());
ASSERT_OK(iterator->Refresh());
iterator->SeekForPrev(get_key(index));
check_iterator(iterator, expected_status, backward_value);
if (extra_check) {
extra_check(iterator);
}
delete iterator;
// Prev
iterator = create_iterator();
iterator->Seek(get_key(index + 1));
ASSERT_TRUE(iterator->Valid());
ASSERT_OK(iterator->status());
iterator->Prev();
check_iterator(iterator, expected_status, backward_value);
if (extra_check) {
extra_check(iterator);
}
delete iterator;
};
for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
// Avoid values from being purged.
std::vector<const Snapshot*> snapshots;
DestroyAndReopen(GetTestOptions());
// fill data
for (int i = 0; i < static_cast<int>(data.size()); i++) {
for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
std::string key = get_key(i);
std::string value = get_value(i, j);
WriteBatch batch;
switch (data[i][j]) {
case kTypeValue:
ASSERT_OK(Put(key, value));
break;
case kTypeDeletion:
ASSERT_OK(Delete(key));
break;
case kTypeSingleDeletion:
ASSERT_OK(SingleDelete(key));
break;
case kTypeMerge:
ASSERT_OK(Merge(key, value));
break;
case kTypeBlobIndex:
ASSERT_OK(PutBlobIndex(&batch, key, value));
ASSERT_OK(Write(&batch));
break;
default:
FAIL();
};
}
snapshots.push_back(dbfull()->GetSnapshot());
}
ASSERT_OK(
dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
snapshots.push_back(dbfull()->GetSnapshot());
MoveDataTo(tier);
// Normal iterator
verify(1, Status::kCorruption, "", "", create_normal_iterator);
verify(3, Status::kCorruption, "", "", create_normal_iterator);
verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
create_normal_iterator);
verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
create_normal_iterator);
verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
create_normal_iterator);
verify(11, Status::kCorruption, "", "", create_normal_iterator);
verify(13, Status::kOk,
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
create_normal_iterator);
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
create_normal_iterator);
// Iterator with blob support
verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
create_blob_iterator, check_is_blob(true));
verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
create_blob_iterator, check_is_blob(true));
verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
create_blob_iterator, check_is_blob(false));
verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
create_blob_iterator, check_is_blob(false));
verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
create_blob_iterator, check_is_blob(false));
if (tier <= kImmutableMemtables) {
verify(11, Status::kNotSupported, "", "", create_blob_iterator);
} else {
verify(11, Status::kCorruption, "", "", create_blob_iterator);
}
verify(13, Status::kOk,
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
create_blob_iterator, check_is_blob(false));
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
create_blob_iterator, check_is_blob(false));
// Iterator with blob support and using seek.
ASSERT_OK(dbfull()->SetOptions(
cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
create_blob_iterator, check_is_blob(true));
verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
create_blob_iterator, check_is_blob(true));
verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
create_blob_iterator, check_is_blob(false));
verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
create_blob_iterator, check_is_blob(false));
verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
create_blob_iterator, check_is_blob(false));
if (tier <= kImmutableMemtables) {
verify(11, Status::kNotSupported, "", "", create_blob_iterator);
} else {
verify(11, Status::kCorruption, "", "", create_blob_iterator);
}
verify(13, Status::kOk,
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
create_blob_iterator, check_is_blob(false));
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
create_blob_iterator, check_is_blob(false));
for (auto* snapshot : snapshots) {
dbfull()->ReleaseSnapshot(snapshot);
}
}
}
TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
const std::vector<std::vector<std::string>> data = {
/*00*/ {"Put"},
/*01*/ {"Put", "Merge", "Merge", "Merge"},
/*02*/ {"Put"}};
auto get_key = [](size_t index) { return ("key" + std::to_string(index)); };
auto get_value = [&](size_t index, size_t version) {
return get_key(index) + "_value" + std::to_string(version);
};
auto check_iterator = [&](Iterator* iterator, Status expected_status,
const Slice& expected_value) {
ASSERT_EQ(expected_status, iterator->status());
if (expected_status.ok()) {
ASSERT_TRUE(iterator->Valid());
ASSERT_EQ(expected_value, iterator->value());
} else {
ASSERT_FALSE(iterator->Valid());
}
};
auto verify = [&](size_t index, Status expected_status,
const Slice& expected_value) {
// Seek
{
Iterator* iterator = db_->NewIterator(ReadOptions());
std::unique_ptr<Iterator> iterator_guard(iterator);
ASSERT_OK(iterator->status());
ASSERT_OK(iterator->Refresh());
iterator->Seek(get_key(index));
check_iterator(iterator, expected_status, expected_value);
}
// Next
{
Iterator* iterator = db_->NewIterator(ReadOptions());
std::unique_ptr<Iterator> iterator_guard(iterator);
ASSERT_OK(iterator->Refresh());
iterator->Seek(get_key(index - 1));
ASSERT_TRUE(iterator->Valid());
ASSERT_OK(iterator->status());
iterator->Next();
check_iterator(iterator, expected_status, expected_value);
}
// SeekForPrev
{
Iterator* iterator = db_->NewIterator(ReadOptions());
std::unique_ptr<Iterator> iterator_guard(iterator);
ASSERT_OK(iterator->status());
ASSERT_OK(iterator->Refresh());
iterator->SeekForPrev(get_key(index));
check_iterator(iterator, expected_status, expected_value);
}
// Prev
{
Iterator* iterator = db_->NewIterator(ReadOptions());
std::unique_ptr<Iterator> iterator_guard(iterator);
iterator->Seek(get_key(index + 1));
ASSERT_TRUE(iterator->Valid());
ASSERT_OK(iterator->status());
iterator->Prev();
check_iterator(iterator, expected_status, expected_value);
}
};
Options options = GetTestOptions();
options.enable_blob_files = true;
options.min_blob_size = 0;
DestroyAndReopen(options);
// fill data
for (size_t i = 0; i < data.size(); i++) {
for (size_t j = 0; j < data[i].size(); j++) {
std::string key = get_key(i);
std::string value = get_value(i, j);
if (data[i][j] == "Put") {
ASSERT_OK(Put(key, value));
ASSERT_OK(Flush());
} else if (data[i][j] == "Merge") {
ASSERT_OK(Merge(key, value));
ASSERT_OK(Flush());
}
}
}
std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," +
get_value(1, 2) + "," + get_value(1, 3);
Status expected_status;
verify(1, expected_status, expected_value);
// Test DBIter::FindValueForCurrentKeyUsingSeek flow.
ASSERT_OK(dbfull()->SetOptions(cfh(),
{{"max_sequential_skip_in_iterations", "0"}}));
verify(1, expected_status, expected_value);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
RegisterCustomObjects(argc, argv);
return RUN_ALL_TESTS();
}