// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include "db/memtable_list.h"

#include <algorithm>
#include <string>
#include <vector>

#include "db/merge_context.h"
#include "db/version_set.h"
#include "db/write_controller.h"
#include "rocksdb/db.h"
#include "rocksdb/status.h"
#include "rocksdb/write_buffer_manager.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/string_util.h"
#include "utilities/merge_operators.h"

namespace ROCKSDB_NAMESPACE {

namespace {
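// Packs a user value together with a fixed64 unix write time. This mirrors
// the format in which TimedPut (kTypeValuePreferredSeqno) entries are stored
// in the memtable: {value, write_unix_time}.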
std::string ValueWithWriteTime(std::string value, uint64_t write_time) {
  std::string result = std::move(value);
  PutFixed64(&result, write_time);
  return result;
}
}  // namespace

class MemTableListTest : public testing::Test {
 public:
  std::string dbname;
  DB* db;
  Options options;
  std::vector<ColumnFamilyHandle*> handles;
  std::atomic<uint64_t> file_number;

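  // Each test runs against its own per-thread DB path; destroy any leftover
  // DB up front so runs are independent.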
  MemTableListTest() : db(nullptr), file_number(1) {
    dbname = test::PerThreadDBPath("memtable_list_test");
    options.create_if_missing = true;
    EXPECT_OK(DestroyDB(dbname, options));
  }

  // Create a test db if not yet created
  void CreateDB() {
    if (db == nullptr) {
      options.create_if_missing = true;
      EXPECT_OK(DestroyDB(dbname, options));
      // Open DB only with default column family
      ColumnFamilyOptions cf_options;
      std::vector<ColumnFamilyDescriptor> cf_descs;
      if (udt_enabled_) {
        cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
      }
      cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options);
      Status s = DB::Open(options, dbname, cf_descs, &handles, &db);
      EXPECT_OK(s);

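      // Give the two extra column families their own cf_paths directories.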
      ColumnFamilyOptions cf_opt1, cf_opt2;
      cf_opt1.cf_paths.emplace_back(dbname + "_one_1",
                                    std::numeric_limits<uint64_t>::max());
      cf_opt2.cf_paths.emplace_back(dbname + "_two_1",
                                    std::numeric_limits<uint64_t>::max());
      int sz = static_cast<int>(handles.size());
      handles.resize(sz + 2);
      s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]);
      EXPECT_OK(s);
      s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]);
      EXPECT_OK(s);

      cf_descs.emplace_back("one", cf_options);
      cf_descs.emplace_back("two", cf_options);
    }
  }

  ~MemTableListTest() override {
    if (db) {
      std::vector<ColumnFamilyDescriptor> cf_descs(handles.size());
      for (int i = 0; i != static_cast<int>(handles.size()); ++i) {
        EXPECT_OK(handles[i]->GetDescriptor(&cf_descs[i]));
      }
      for (auto h : handles) {
        if (h) {
          EXPECT_OK(db->DestroyColumnFamilyHandle(h));
        }
      }
      handles.clear();
      delete db;
      db = nullptr;
      EXPECT_OK(DestroyDB(dbname, options, cf_descs));
    }
  }

  // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all
  // structures needed to call this function.
  Status Mock_InstallMemtableFlushResults(
      MemTableList* list, const MutableCFOptions& mutable_cf_options,
      const autovector<ReadOnlyMemTable*>& m,
      autovector<ReadOnlyMemTable*>* to_delete) {
    // Create a mock Logger
    test::NullLogger logger;
    LogBuffer log_buffer(DEBUG_LEVEL, &logger);

    CreateDB();
    // Create a mock VersionSet
    DBOptions db_options;
    ImmutableDBOptions immutable_db_options(db_options);
    EnvOptions env_options;
    std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
    WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
    WriteController write_controller(10000000u);

    VersionSet versions(dbname, &immutable_db_options, env_options,
                        table_cache.get(), &write_buffer_manager,
                        &write_controller, /*block_cache_tracer=*/nullptr,
                        /*io_tracer=*/nullptr, /*db_id=*/"",
                        /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
                        /*error_handler=*/nullptr, /*read_only=*/false);
    std::vector<ColumnFamilyDescriptor> cf_descs;
    cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
    cf_descs.emplace_back("one", ColumnFamilyOptions());
    cf_descs.emplace_back("two", ColumnFamilyOptions());

    EXPECT_OK(versions.Recover(cf_descs, false));

    // Create mock default ColumnFamilyData
    auto column_family_set = versions.GetColumnFamilySet();
    LogsWithPrepTracker dummy_prep_tracker;
    auto cfd = column_family_set->GetDefault();
    EXPECT_TRUE(nullptr != cfd);
    uint64_t file_num = file_number.fetch_add(1);
    IOStatus io_s;
    // Create dummy mutex.
    InstrumentedMutex mutex;
    InstrumentedMutexLock l(&mutex);
    std::list<std::unique_ptr<FlushJobInfo>> flush_jobs_info;
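    // Install the flush result while holding the (dummy) DB mutex, matching
    // the locking the real flush path uses.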
    Status s = list->TryInstallMemtableFlushResults(
        cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex,
        file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info);
    EXPECT_OK(io_s);
    return s;
  }

  // Calls MemTableList::InstallMemtableFlushResults() and sets up all
  // structures needed to call this function.
  Status Mock_InstallMemtableAtomicFlushResults(
      autovector<MemTableList*>& lists, const autovector<uint32_t>& cf_ids,
      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
      const autovector<const autovector<ReadOnlyMemTable*>*>& mems_list,
      autovector<ReadOnlyMemTable*>* to_delete) {
    // Create a mock Logger
    test::NullLogger logger;
    LogBuffer log_buffer(DEBUG_LEVEL, &logger);

    CreateDB();
    // Create a mock VersionSet
    DBOptions db_options;
    ImmutableDBOptions immutable_db_options(db_options);
    EnvOptions env_options;
    std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
    WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
    WriteController write_controller(10000000u);

    VersionSet versions(dbname, &immutable_db_options, env_options,
                        table_cache.get(), &write_buffer_manager,
                        &write_controller, /*block_cache_tracer=*/nullptr,
                        /*io_tracer=*/nullptr, /*db_id=*/"",
                        /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
                        /*error_handler=*/nullptr, /*read_only=*/false);
    std::vector<ColumnFamilyDescriptor> cf_descs;
    cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
    cf_descs.emplace_back("one", ColumnFamilyOptions());
    cf_descs.emplace_back("two", ColumnFamilyOptions());
    EXPECT_OK(versions.Recover(cf_descs, false));

    // Create mock default ColumnFamilyData
    auto column_family_set = versions.GetColumnFamilySet();

    LogsWithPrepTracker dummy_prep_tracker;
    autovector<ColumnFamilyData*> cfds;
    for (int i = 0; i != static_cast<int>(cf_ids.size()); ++i) {
      cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i]));
      EXPECT_NE(nullptr, cfds[i]);
    }
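    // Fabricate one output file, with a fresh fake file number, for each
    // column family taking part in the atomic flush.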
    std::vector<FileMetaData> file_metas;
    file_metas.reserve(cf_ids.size());
    for (size_t i = 0; i != cf_ids.size(); ++i) {
      FileMetaData meta;
      uint64_t file_num = file_number.fetch_add(1);
      meta.fd = FileDescriptor(file_num, 0, 0);
      file_metas.emplace_back(meta);
    }
    autovector<FileMetaData*> file_meta_ptrs;
    for (auto& meta : file_metas) {
      file_meta_ptrs.push_back(&meta);
    }
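    // InstallMemtableAtomicFlushResults() takes one FlushJobInfo list pointer
    // per column family; back the pointers with storage that outlives the
    // call.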
    std::vector<std::list<std::unique_ptr<FlushJobInfo>>>
        committed_flush_jobs_info_storage(cf_ids.size());
    autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
        committed_flush_jobs_info;
    for (int i = 0; i < static_cast<int>(cf_ids.size()); ++i) {
      committed_flush_jobs_info.push_back(
          &committed_flush_jobs_info_storage[i]);
    }

    InstrumentedMutex mutex;
    InstrumentedMutexLock l(&mutex);
    return InstallMemtableAtomicFlushResults(
        &lists, cfds, mutable_cf_options_list, mems_list, &versions,
        nullptr /* prep_tracker */, &mutex, file_meta_ptrs,
        committed_flush_jobs_info, to_delete, nullptr, &log_buffer);
  }

 protected:
  bool udt_enabled_ = false;
};

TEST_F(MemTableListTest, Empty) {
  // Create an empty MemTableList and validate basic functions.
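  // Ctor args are min_write_buffer_number_to_merge (1),
  // max_write_buffer_number_to_maintain (0), and
  // max_write_buffer_size_to_maintain (0), so no flushed memtable history is
  // retained.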
  MemTableList list(1, 0, 0);

  ASSERT_EQ(0, list.NumNotFlushed());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
  ASSERT_FALSE(list.IsFlushPending());

  autovector<ReadOnlyMemTable*> mems;
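  // A memtable id of UINT64_MAX places no cap on which memtables are
  // eligible; the empty list should still yield nothing.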
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &mems);
  ASSERT_EQ(0, mems.size());

  autovector<ReadOnlyMemTable*> to_delete;
  list.current()->Unref(&to_delete);
  ASSERT_EQ(0, to_delete.size());
}

TEST_F(MemTableListTest, GetTest) {
  // Create MemTableList
  int min_write_buffer_number_to_merge = 2;
  int max_write_buffer_number_to_maintain = 0;
  int64_t max_write_buffer_size_to_maintain = 0;
  MemTableList list(min_write_buffer_number_to_merge,
                    max_write_buffer_number_to_maintain,
                    max_write_buffer_size_to_maintain);

  SequenceNumber seq = 1;
  std::string value;
  Status s;
  MergeContext merge_context;
  InternalKeyComparator ikey_cmp(options.comparator);
  SequenceNumber max_covering_tombstone_seq = 0;
  autovector<ReadOnlyMemTable*> to_delete;
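
  // A lookup before anything has been added should miss.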
  LookupKey lkey("key1", seq);
  bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr,
                                   /*timestamp=*/nullptr, &s, &merge_context,
                                   &max_covering_tombstone_seq, ReadOptions());
  ASSERT_FALSE(found);

  // Create a MemTable
  InternalKeyComparator cmp(BytewiseComparator());
  auto factory = std::make_shared<SkipListFactory>();
  options.memtable_factory = factory;
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
  ImmutableOptions ioptions(options);

  WriteBufferManager wb(options.db_write_buffer_size);
  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
                               kMaxSequenceNumber, 0 /* column_family_id */);
  mem->Ref();

  // Write some keys to this memtable.
  ASSERT_OK(
      mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
  ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2",
                     nullptr /* kv_prot_info */));
  ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1",
                     nullptr /* kv_prot_info */));
  ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2",
                     nullptr /* kv_prot_info */));
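  // A TimedPut-style entry: the value carries a packed unix write time, which
  // the read path ignores.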
  ASSERT_OK(mem->Add(++seq, kTypeValuePreferredSeqno, "key3",
                     ValueWithWriteTime("value3.1", 20),
                     nullptr /* kv_prot_info */));

  // Fetch the newly written keys
  merge_context.Clear();
  s = Status::OK();
  found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
                   /*timestamp*/ nullptr, &s, &merge_context,
                   &max_covering_tombstone_seq, ReadOptions(),
                   false /* immutable_memtable */);
  ASSERT_TRUE(s.ok() && found);
  ASSERT_EQ(value, "value1");

  merge_context.Clear();
  s = Status::OK();
  found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr,
                   /*timestamp*/ nullptr, &s, &merge_context,
                   &max_covering_tombstone_seq, ReadOptions(),
                   false /* immutable_memtable */);
  // At sequence number 2, "key1" was covered by the deletion, so the memtable
  // reports it as found but deleted.
  ASSERT_TRUE(found && s.IsNotFound());

  merge_context.Clear();
  s = Status::OK();
  found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
                   /*timestamp*/ nullptr, &s, &merge_context,
                   &max_covering_tombstone_seq, ReadOptions(),
                   false /* immutable_memtable */);
  ASSERT_TRUE(s.ok() && found);
  ASSERT_EQ(value, "value2.2");

  merge_context.Clear();
  s = Status::OK();
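  // kTypeValuePreferredSeqno entries read like plain values: only "value3.1"
  // comes back, without the packed write time.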
  found = mem->Get(LookupKey("key3", seq), &value, /*columns*/ nullptr,
                   /*timestamp*/ nullptr, &s, &merge_context,
                   &max_covering_tombstone_seq, ReadOptions(),
                   false /* immutable_memtable */);
  ASSERT_TRUE(s.ok() && found);
  ASSERT_EQ(value, "value3.1");

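  // Five entries were added above, exactly one of which was a deletion.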
ASSERT_EQ(5, mem->NumEntries());
|
|
|
|
ASSERT_EQ(1, mem->NumDeletion());
|
2015-04-10 01:01:11 +00:00
|
|
|
|
|
|
|
// Add memtable to list
|
2022-08-05 19:02:33 +00:00
|
|
|
// This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
|
|
|
|
// in MemTableListVersion::GetFromList work.
|
|
|
|
mem->ConstructFragmentedRangeTombstones();
|
  list.Add(mem, &to_delete);
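As context for the `ConstructFragmentedRangeTombstones()` call above: range deletions reach a memtable through `DB::DeleteRange`, and point reads expect those tombstones to have been fragmented into non-overlapping pieces. A minimal, hedged sketch of the write that produces such tombstones (the key range is illustrative):
```
#include "rocksdb/db.h"

// Sketch: the range tombstones that the fragmentation step above prepares
// for reads are produced by DeleteRange-style writes such as this one.
rocksdb::Status DeleteKeyRange(rocksdb::DB* db) {
  // Removes all keys in ["key_begin", "key_end") from the default column
  // family; the key range here is an assumption for illustration.
  return db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                         "key_begin", "key_end");
}
```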

  SequenceNumber saved_seq = seq;

  // Create another memtable and write some keys to it
  WriteBufferManager wb2(options.db_write_buffer_size);
  MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
                                kMaxSequenceNumber, 0 /* column_family_id */);
  mem2->Ref();
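The test wires a `WriteBufferManager` into each memtable by hand; in application code the same budget is normally attached to the DB through `Options`. A small sketch, with 64 MB as an arbitrary example budget:
```
#include <memory>

#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"

// Sketch: cap total memtable memory across the DB at 64 MB (example figure).
rocksdb::Options MakeOptionsWithWriteBufferBudget() {
  rocksdb::Options options;
  options.write_buffer_manager =
      std::make_shared<rocksdb::WriteBufferManager>(64 << 20);
  return options;
}
```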
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
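A hedged sketch of opting into this protection from the user side. The four-argument `WriteBatch` constructor mirrors the description above; treat the exact parameter list as an assumption rather than a stable contract:
```
#include <cassert>

#include "rocksdb/write_batch.h"

// Hedged sketch: request 8 bytes of integrity-protection info per key.
// Coverage follows each entry from the batch into the memtable, where it is
// verified once the entry is encoded into the MemTable buffer.
rocksdb::WriteBatch MakeProtectedBatch() {
  rocksdb::WriteBatch batch(/*reserved_bytes=*/0, /*max_bytes=*/0,
                            /*protection_bytes_per_key=*/8,
                            /*default_cf_ts_sz=*/0);
  rocksdb::Status s = batch.Put("key1", "value1");  // covered by ProtectionInfo
  assert(s.ok());
  (void)s;
  return batch;
}
```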
  ASSERT_OK(
      mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
  ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3",
                      nullptr /* kv_prot_info */));
  ASSERT_OK(mem2->Add(++seq, kTypeMerge, "key3", "value3.2",
                      nullptr /* kv_prot_info */));

  // Add second memtable to list
  // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
  // in MemTableListVersion::GetFromList work.
  mem2->ConstructFragmentedRangeTombstones();
  list.Add(mem2, &to_delete);

  // Fetch keys via MemTableList
  merge_context.Clear();
  s = Status::OK();
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
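A hedged sketch of the new lookup from the application side, using `DB::GetEntity` and `PinnableWideColumns` as described above (error handling trimmed):
```
#include <iostream>

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

// Sketch: wide-column point lookup. A plain key-value comes back as a
// single default (anonymous) column, per the description above.
void PrintEntity(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::PinnableWideColumns result;
  rocksdb::Status s = db->GetEntity(rocksdb::ReadOptions(),
                                    db->DefaultColumnFamily(), key, &result);
  if (!s.ok()) {
    return;
  }
  for (const rocksdb::WideColumn& col : result.columns()) {
    std::cout << col.name().ToString() << " = " << col.value().ToString()
              << "\n";
  }
}
```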
  found =
      list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
                          /*timestamp=*/nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, ReadOptions());
  ASSERT_TRUE(found && s.IsNotFound());

  merge_context.Clear();
  s = Status::OK();
  found = list.current()->Get(LookupKey("key1", saved_seq), &value,
                              /*columns=*/nullptr, /*timestamp=*/nullptr, &s,
                              &merge_context, &max_covering_tombstone_seq,
                              ReadOptions());
  ASSERT_TRUE(s.ok() && found);
  ASSERT_EQ("value1", value);

  merge_context.Clear();
  s = Status::OK();
  found =
      list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
                          /*timestamp=*/nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, ReadOptions());
  ASSERT_TRUE(s.ok() && found);
  ASSERT_EQ(value, "value2.3");

  merge_context.Clear();
  s = Status::OK();
  found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr,
                              /*timestamp=*/nullptr, &s, &merge_context,
                              &max_covering_tombstone_seq, ReadOptions());
  ASSERT_FALSE(found);
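The miss above is deliberate: `LookupKey(user_key, seq)` bounds visibility to entries whose sequence number is at most `seq`, and `"key2"` was first written well after sequence 1. At the user level, the same effect comes from reading through a snapshot; a minimal sketch:
```
#include <string>

#include "rocksdb/db.h"

// Sketch: the user-level analogue of LookupKey("key2", 1) is a read at an
// old snapshot -- entries written after the snapshot's sequence number are
// invisible to it.
rocksdb::Status ReadAtSnapshot(rocksdb::DB* db, std::string* value) {
  const rocksdb::Snapshot* snap = db->GetSnapshot();  // pins the current seqno
  rocksdb::ReadOptions read_options;
  read_options.snapshot = snap;
  rocksdb::Status s = db->Get(read_options, "key2", value);
  db->ReleaseSnapshot(snap);
  return s;
}
```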

  merge_context.Clear();
  s = Status::OK();
  found =
      list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
                          /*timestamp=*/nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, ReadOptions());
  ASSERT_TRUE(s.ok() && found);
  ASSERT_EQ(value, "value3.1,value3.2");
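The comma-joined result asserted above is what a string-append merge produces: `"value3.1"` was written as a merge operand into the first memtable and `"value3.2"` into the second, and the lookup combines operands across the list. A hedged sketch of the kind of setup that yields this behavior (assuming the built-in string-append operator, whose default delimiter is a comma):
```
#include "rocksdb/options.h"
#include "utilities/merge_operators.h"

// Hedged sketch: a string-append merge operator (default delimiter ',')
// reproduces the "value3.1,value3.2" result asserted above.
rocksdb::Options MakeMergeOptions() {
  rocksdb::Options options;
  options.merge_operator =
      rocksdb::MergeOperators::CreateStringAppendOperator();
  return options;
}
```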
  ASSERT_EQ(2, list.NumNotFlushed());

  list.current()->Unref(&to_delete);
  for (ReadOnlyMemTable* m : to_delete) {
    delete m;
  }
}

TEST_F(MemTableListTest, GetFromHistoryTest) {
  // Create MemTableList
  int min_write_buffer_number_to_merge = 2;
  int max_write_buffer_number_to_maintain = 2;
  int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, it may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and the mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually work as both an upper bound and a lower bound: history trimming starts if the number of immutable memtables exceeds the limit, but the count never goes below (limit-1) due to trimming.
In order to mimic that behavior with the new option, history trimming stops if dropping the next immutable memtable would cause the total memory usage to go below the size limit. For example, assuming the size limit is set to 64MB and there are 3 immutable memtables with sizes of 20, 30, 30: although the total memory usage is 80MB > 64MB, dropping the oldest memtable would reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
  MemTableList list(min_write_buffer_number_to_merge,
                    max_write_buffer_number_to_maintain,
                    max_write_buffer_size_to_maintain);
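The constructor arguments above correspond to the user-facing options from the PR just quoted. A hedged sketch of configuring history retention on a column family, reusing the 64 MB figure from the PR's example:
```
#include "rocksdb/options.h"

// Hedged sketch: retain flushed memtables for conflict checking until their
// total size (together with the mutable memtable) would exceed 64 MB.
rocksdb::ColumnFamilyOptions MakeHistoryOptions() {
  rocksdb::ColumnFamilyOptions cf_options;
  // Newer size-based limit; takes precedence when both are non-zero.
  cf_options.max_write_buffer_size_to_maintain = 64 << 20;
  // Older count-based limit, left at 0 here so the size limit governs.
  cf_options.max_write_buffer_number_to_maintain = 0;
  return cf_options;
}
```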
  SequenceNumber seq = 1;
  std::string value;
  Status s;
  MergeContext merge_context;
  InternalKeyComparator ikey_cmp(options.comparator);
Use only "local" range tombstones during Get (#4449)
Summary:
Previously, range tombstones were accumulated from every level, which
was necessary if a range tombstone in a higher level covered a key in a lower
level. However, the complexity of RangeDelAggregator::AddTombstones grows with
the number of tombstones currently stored in it, which is wasteful in
the Get case: there we only need to know the highest sequence number of range
tombstones from higher levels that cover the key, and to compute the highest covering
sequence number at the current level. This change introduces that optimization and
removes the use of RangeDelAggregator from the Get path.
In the benchmark results, the following command was used to initialize the database:
```
./db_bench -db=/dev/shm/5k-rts -use_existing_db=false -benchmarks=filluniquerandom -write_buffer_size=1048576 -compression_type=lz4 -target_file_size_base=1048576 -max_bytes_for_level_base=4194304 -value_size=112 -key_size=16 -block_size=4096 -level_compaction_dynamic_level_bytes=true -num=5000000 -max_background_jobs=12 -benchmark_write_rate_limit=20971520 -range_tombstone_width=100 -writes_per_range_tombstone=100 -max_num_range_tombstones=50000 -bloom_bits=8
```
...and the following command was used to measure read throughput:
```
./db_bench -db=/dev/shm/5k-rts/ -use_existing_db=true -benchmarks=readrandom -disable_auto_compactions=true -num=5000000 -reads=100000 -threads=32
```
The filluniquerandom command was only run once, and the resulting database was used
to measure read performance before and after the PR. Both binaries were compiled with
`DEBUG_LEVEL=0`.
Readrandom results before PR:
```
readrandom : 4.544 micros/op 220090 ops/sec; 16.9 MB/s (63103 of 100000 found)
```
Readrandom results after PR:
```
readrandom : 11.147 micros/op 89707 ops/sec; 6.9 MB/s (63103 of 100000 found)
```
So it's actually slower right now, but this PR paves the way for future optimizations (see #4493).
----
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4449
Differential Revision: D10370575
Pulled By: abhimadan
fbshipit-source-id: 9a2e152be1ef36969055c0e9eb4beb0d96c11f4d
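To make the optimization concrete, here is a minimal sketch (not the RocksDB implementation) of the visibility test that tracking a single maximum covering sequence number enables:
```
#include <cstdint>

// Conceptual sketch: with only the highest covering tombstone seqno tracked,
// a point lookup can decide visibility without materializing every tombstone.
bool EntryVisible(uint64_t entry_seq, uint64_t max_covering_tombstone_seq) {
  // The entry survives only if it is newer than every range tombstone
  // covering its key.
  return entry_seq > max_covering_tombstone_seq;
}
```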
  SequenceNumber max_covering_tombstone_seq = 0;
  autovector<ReadOnlyMemTable*> to_delete;

  LookupKey lkey("key1", seq);
  bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr,
                                   /*timestamp=*/nullptr, &s, &merge_context,
                                   &max_covering_tombstone_seq, ReadOptions());
  ASSERT_FALSE(found);

  // Create a MemTable
  InternalKeyComparator cmp(BytewiseComparator());
  auto factory = std::make_shared<SkipListFactory>();
  options.memtable_factory = factory;
  ImmutableOptions ioptions(options);

  WriteBufferManager wb(options.db_write_buffer_size);
  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
                               kMaxSequenceNumber, 0 /* column_family_id */);
  mem->Ref();

  // Write some keys to this memtable.
  ASSERT_OK(
      mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
  ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2",
                     nullptr /* kv_prot_info */));
  ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2",
                     nullptr /* kv_prot_info */));

  // Fetch the newly written keys
  merge_context.Clear();
  s = Status::OK();
  found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
                   /*timestamp*/ nullptr, &s, &merge_context,
                   &max_covering_tombstone_seq, ReadOptions(),
                   false /* immutable_memtable */);
  // MemTable found out that this key is *not* found (at this sequence#)
  ASSERT_TRUE(found && s.IsNotFound());

  merge_context.Clear();
  s = Status::OK();
  found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
                   /*timestamp*/ nullptr, &s, &merge_context,
                   &max_covering_tombstone_seq, ReadOptions(),
                   false /* immutable_memtable */);
ASSERT_TRUE(s.ok() && found);
ASSERT_EQ(value, "value2.2");
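
// The lookup succeeds (s.ok() && found) because "value2.2" is the newest
// version of "key2" visible at `seq`. `max_covering_tombstone_seq` reflects
// the #4449 optimization: Get only tracks the highest sequence number of any
// covering range tombstone instead of aggregating tombstones from every
// level. The trailing `false /* immutable_memtable */` flag tells
// MemTable::Get that the memtable is still mutable (presumably so it does
// not rely on the cached fragmented range tombstone list -- an assumption;
// see the ConstructFragmentedRangeTombstones() calls below).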

// Add memtable to list.
// Construct the fragmented range tombstones up front so that
// assert(memtable->IsFragmentedRangeTombstonesConstructed()) in
// MemTableListVersion::GetFromList passes.
mem->ConstructFragmentedRangeTombstones();
list.Add(mem, &to_delete);
ASSERT_EQ(0, to_delete.size());
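
// MemTableList::Add takes over the reference acquired earlier via Ref();
// `to_delete` stays empty because nothing has been trimmed from the list
// yet. Memtables are only handed back through `to_delete` once they are
// unreferenced and pushed out of the history (see the third memtable below).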

// Fetch keys via MemTableList
merge_context.Clear();
s = Status::OK();
found =
    list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
                        /*timestamp=*/nullptr, &s, &merge_context,
                        &max_covering_tombstone_seq, ReadOptions());
ASSERT_TRUE(found && s.IsNotFound());
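
// `found == true` with Status::NotFound means the list resolved "key1" to a
// tombstone (a definitive miss), unlike `found == false` further below,
// where the key is absent from the list altogether.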

merge_context.Clear();
s = Status::OK();
found =
    list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
                        /*timestamp=*/nullptr, &s, &merge_context,
                        &max_covering_tombstone_seq, ReadOptions());
ASSERT_TRUE(s.ok() && found);
ASSERT_EQ("value2.2", value);

// Flush this memtable from the list.
// (It will then become part of the memtable history.)
autovector<ReadOnlyMemTable*> to_flush;
list.PickMemtablesToFlush(
    std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
ASSERT_EQ(1, to_flush.size());
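
// Passing std::numeric_limits<uint64_t>::max() as `memtable_id` places no
// upper bound on which memtables to pick, so every immutable memtable is
// eligible; only `mem` exists, hence exactly one pick.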

MutableCFOptions mutable_cf_options(options);
s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
                                     &to_delete);
ASSERT_OK(s);
ASSERT_EQ(0, list.NumNotFlushed());
ASSERT_EQ(1, list.NumFlushed());
ASSERT_EQ(0, to_delete.size());
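
// Mock_InstallMemtableFlushResults simulates a successful flush install:
// `mem` moves from the not-flushed list into the flushed history
// (NumFlushed 0 -> 1), where it is retained for write-conflict checking
// instead of being deleted -- hence `to_delete` remains empty.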

// Verify keys are no longer in MemTableList
merge_context.Clear();
found =
    list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
                        /*timestamp=*/nullptr, &s, &merge_context,
                        &max_covering_tombstone_seq, ReadOptions());
ASSERT_FALSE(found);

merge_context.Clear();
found =
    list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
                        /*timestamp=*/nullptr, &s, &merge_context,
                        &max_covering_tombstone_seq, ReadOptions());
ASSERT_FALSE(found);

// Verify keys are present in history
merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory(
    LookupKey("key1", seq), &value, /*columns=*/nullptr,
    /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
    ReadOptions());
ASSERT_TRUE(found && s.IsNotFound());

merge_context.Clear();
s = Status::OK();
found = list.current()->GetFromHistory(
    LookupKey("key2", seq), &value, /*columns=*/nullptr,
    /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
    ReadOptions());
ASSERT_TRUE(found);
ASSERT_EQ("value2.2", value);

// Create another memtable and write some keys to it
WriteBufferManager wb2(options.db_write_buffer_size);
MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
                              kMaxSequenceNumber, 0 /* column_family_id */);
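
// Each memtable gets its own WriteBufferManager here, keeping memory
// accounting independent between memtables. kMaxSequenceNumber is passed for
// the earliest-sequence-number parameter and 0 for the column family id
// (the first role is inferred, so treat it as an assumption).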
mem2->Ref();

ASSERT_OK(
    mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3",
                    nullptr /* kv_prot_info */));
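
// `kv_prot_info` is the optional per-entry integrity protection from #7748
// (a hash covering key, value, optype, and friends); passing nullptr adds
// these entries without any protection info to verify.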

// Add second memtable to list.
// As above, construct the fragmented range tombstones so that the assert in
// MemTableListVersion::GetFromList passes.
mem2->ConstructFragmentedRangeTombstones();
list.Add(mem2, &to_delete);
ASSERT_EQ(0, to_delete.size());

to_flush.clear();
list.PickMemtablesToFlush(
    std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
ASSERT_EQ(1, to_flush.size());

// Flush second memtable
s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
                                     &to_delete);
ASSERT_OK(s);
ASSERT_EQ(0, list.NumNotFlushed());
ASSERT_EQ(2, list.NumFlushed());
ASSERT_EQ(0, to_delete.size());
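
// Both flushed memtables still fit within the configured history, so the
// list retains them (NumFlushed == 2) and hands nothing back for deletion.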

// Add a third memtable to push the first memtable out of the history
WriteBufferManager wb3(options.db_write_buffer_size);
MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3,
                              kMaxSequenceNumber, 0 /* column_family_id */);
mem3->Ref();
// As above, construct the fragmented range tombstones so that the assert in
// MemTableListVersion::GetFromList passes.
mem3->ConstructFragmentedRangeTombstones();
list.Add(mem3, &to_delete);
ASSERT_EQ(1, list.NumNotFlushed());
ASSERT_EQ(1, list.NumFlushed());
ASSERT_EQ(1, to_delete.size());
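
// Adding a third (unflushed) memtable exceeds the number of memtables the
// list is configured to maintain, so the oldest flushed memtable (`mem`) is
// trimmed from the history and returned via `to_delete` for the caller to
// actually delete.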
|
|
|
|
|
|
|
|
// Verify keys are no longer in MemTableList
|
|
|
|
merge_context.Clear();
  s = Status::OK();
  found =
      list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
                          /*timestamp=*/nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, ReadOptions());
  ASSERT_FALSE(found);
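
  // Get() on the current version searches only the unflushed memtables, so
  // none of the keys that have already been flushed should be found here.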

  merge_context.Clear();
  s = Status::OK();
  found =
      list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
                          /*timestamp=*/nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, ReadOptions());
  ASSERT_FALSE(found);

  merge_context.Clear();
  s = Status::OK();
  found =
      list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
                          /*timestamp=*/nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, ReadOptions());
  ASSERT_FALSE(found);

  // Verify that the second memtable's keys are in the history
  merge_context.Clear();
  s = Status::OK();
  found = list.current()->GetFromHistory(
      LookupKey("key1", seq), &value, /*columns=*/nullptr,
      /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
      ReadOptions());
  ASSERT_TRUE(found && s.IsNotFound());
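
  // A result of found == true with Status::NotFound() means the history
  // lookup hit a deletion entry for key1 rather than a live value.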

  merge_context.Clear();
  s = Status::OK();
  found = list.current()->GetFromHistory(
      LookupKey("key3", seq), &value, /*columns=*/nullptr,
      /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
      ReadOptions());
  ASSERT_TRUE(found);
  ASSERT_EQ("value3", value);

  // Verify that key2 from the first memtable is no longer in the history
  merge_context.Clear();
  s = Status::OK();
  found =
      list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
                          /*timestamp=*/nullptr, &s, &merge_context,
                          &max_covering_tombstone_seq, ReadOptions());
  ASSERT_FALSE(found);
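
  // key2 came from the first (oldest) flushed memtable, which appears to
  // have aged out of the retained history given the history limits
  // configured for this test.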

  // Cleanup
  list.current()->Unref(&to_delete);
  ASSERT_EQ(3, to_delete.size());
  for (ReadOnlyMemTable* m : to_delete) {
    delete m;
  }
}
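
// FlushPendingTest exercises MemTableList's flush-request state machine:
// FlushRequested() and PickMemtablesToFlush() together drive the
// IsFlushPending() and imm_flush_needed indicators checked throughout.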
TEST_F(MemTableListTest, FlushPendingTest) {
  const int num_tables = 6;
  SequenceNumber seq = 1;
  Status s;

  auto factory = std::make_shared<SkipListFactory>();
  options.memtable_factory = factory;
  ImmutableOptions ioptions(options);
  InternalKeyComparator cmp(BytewiseComparator());
  WriteBufferManager wb(options.db_write_buffer_size);
  autovector<ReadOnlyMemTable*> to_delete;
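
  // Memtables whose reference count drops to zero are appended to to_delete;
  // the test frees them explicitly once it is done with them.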

  // Create MemTableList
  int min_write_buffer_number_to_merge = 3;
  int max_write_buffer_number_to_maintain = 7;
  int64_t max_write_buffer_size_to_maintain =
      7 * static_cast<int>(options.write_buffer_size);
  MemTableList list(min_write_buffer_number_to_merge,
                    max_write_buffer_number_to_maintain,
                    max_write_buffer_size_to_maintain);
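
  // When both history options are non-zero, max_write_buffer_size_to_maintain
  // takes precedence over max_write_buffer_number_to_maintain.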

  // Create some MemTables
  uint64_t memtable_id = 0;
  std::vector<MemTable*> tables;
  MutableCFOptions mutable_cf_options(options);
  for (int i = 0; i < num_tables; i++) {
    MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb,
                                 kMaxSequenceNumber, 0 /* column_family_id */);
    mem->SetID(memtable_id++);
    mem->Ref();

    std::string value;
    MergeContext merge_context;

    ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
                       nullptr /* kv_prot_info */));
    ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i), "valueN",
                       nullptr /* kv_prot_info */));
    ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
                       nullptr /* kv_prot_info */));
    ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i), "valueM",
                       nullptr /* kv_prot_info */));
    ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
                       nullptr /* kv_prot_info */));

    tables.push_back(mem);
  }
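
  // Each memtable now holds five entries at increasing sequence numbers,
  // including a deletion of its own "keyX<i>" entry.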

  // Nothing to flush
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
  autovector<ReadOnlyMemTable*> to_flush;
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
  ASSERT_EQ(0, to_flush.size());
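
  // Passing std::numeric_limits<uint64_t>::max() as the memtable id places no
  // upper bound on which memtable IDs are eligible to be picked.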

  // Request a flush even though there is nothing to flush
  list.FlushRequested();
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
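
  // A flush request while the list holds no immutable memtables leaves
  // nothing pending.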

  // Attempt to 'flush' to clear request for flush
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
  ASSERT_EQ(0, to_flush.size());
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Request a flush again
  list.FlushRequested();
  // No flush pending since the list is empty.
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Add 2 tables
  list.Add(tables[0], &to_delete);
  list.Add(tables[1], &to_delete);
  ASSERT_EQ(2, list.NumNotFlushed());
  ASSERT_EQ(0, to_delete.size());

  // Even though we have fewer than the minimum number of memtables to
  // flush, a flush is pending because we previously requested a flush and
  // never called PickMemtablesToFlush() to clear the request.
  ASSERT_TRUE(list.IsFlushPending());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Pick tables to flush
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
  ASSERT_EQ(2, to_flush.size());
  ASSERT_EQ(2, list.NumNotFlushed());
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Revert flush
  list.RollbackMemtableFlush(to_flush, false);
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
  to_flush.clear();
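
  // RollbackMemtableFlush() returns the picked memtables to the pool of
  // flushable memtables, which is why imm_flush_needed is re-armed even
  // though no new memtable was added. The second argument appears to control
  // whether other in-progress flushes are rolled back as well (added when
  // flush rollback was fixed for atomic_flush=false); this test passes
  // false.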

  // Add another table
  list.Add(tables[2], &to_delete);
  // We now have the minimum to flush regardless of whether FlushRequested()
  // was called.
  ASSERT_TRUE(list.IsFlushPending());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
  ASSERT_EQ(0, to_delete.size());

  // Pick tables to flush
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
  ASSERT_EQ(3, to_flush.size());
  ASSERT_EQ(3, list.NumNotFlushed());
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Pick tables to flush again
  autovector<ReadOnlyMemTable*> to_flush2;
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
  ASSERT_EQ(0, to_flush2.size());
  ASSERT_EQ(3, list.NumNotFlushed());
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
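
  // The second pick comes back empty: memtables already handed out by
  // PickMemtablesToFlush() are marked flush-in-progress and stay excluded
  // until they are either installed or rolled back.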

  // Add another table
  list.Add(tables[3], &to_delete);
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
  ASSERT_EQ(0, to_delete.size());

  // Request a flush again
  list.FlushRequested();
  ASSERT_TRUE(list.IsFlushPending());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Pick tables to flush again
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
  ASSERT_EQ(1, to_flush2.size());
  ASSERT_EQ(4, list.NumNotFlushed());
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Rollback the first pick (to_flush still holds tables[0], [1], and [2])
  list.RollbackMemtableFlush(to_flush, false);
  ASSERT_TRUE(list.IsFlushPending());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
  to_flush.clear();
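
  // The lifecycle exercised so far, as a rough sketch:
  //   Add()                            -> immutable, eligible to be picked
  //   PickMemtablesToFlush()           -> flush in progress, excluded
  //   RollbackMemtableFlush()          -> eligible to be picked again
  //   TryInstallMemtableFlushResults() -> flushed, retained as history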

  // Add another table
  list.Add(tables[4], &to_delete);
  ASSERT_EQ(5, list.NumNotFlushed());
  // We now have the minimum to flush regardless of whether FlushRequested()
  // was called.
  ASSERT_TRUE(list.IsFlushPending());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
  ASSERT_EQ(0, to_delete.size());

  // Pick tables to flush
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
  // Picks the three oldest memtables. The fourth oldest is already picked in
  // `to_flush2` so must be excluded. The newest (fifth oldest) is
  // non-consecutive with the three oldest once the fourth is omitted, so it
  // must not be picked either.
  ASSERT_EQ(3, to_flush.size());
  ASSERT_EQ(5, list.NumNotFlushed());
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
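
  // imm_flush_needed stays set here, presumably because the newest memtable
  // was skipped and remains flushable on its own.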

  // Pick tables to flush again
  autovector<ReadOnlyMemTable*> to_flush3;
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
  // Picks newest (fifth oldest)
  ASSERT_EQ(1, to_flush3.size());
  ASSERT_EQ(5, list.NumNotFlushed());
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Nothing left to flush
  autovector<ReadOnlyMemTable*> to_flush4;
  list.PickMemtablesToFlush(
      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
  ASSERT_EQ(0, to_flush4.size());
  ASSERT_EQ(5, list.NumNotFlushed());
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Flush the 3 memtables that were picked in to_flush
  s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
                                       &to_delete);
  ASSERT_OK(s);

  // Note: now to_flush contains tables[0,1,2]. to_flush2 contains
  // tables[3]. to_flush3 contains tables[4].
  // The current implementation only commits memtables in the order they were
  // created, so TryInstallMemtableFlushResults will install the first 3
  // tables in to_flush and stop when it encounters a table not yet flushed.
  ASSERT_EQ(2, list.NumNotFlushed());
  int num_in_history =
      std::min(3, static_cast<int>(max_write_buffer_size_to_maintain) /
                      static_cast<int>(options.write_buffer_size));
  ASSERT_EQ(num_in_history, list.NumFlushed());
  ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
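
  // History arithmetic: of the 3 memtables just installed, up to
  // max_write_buffer_size_to_maintain / write_buffer_size are retained as
  // flushed history (for transaction conflict checking); the rest are handed
  // back through to_delete, which is what the two assertions above verify.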

  // Request a flush again. Should be nothing to flush
  list.FlushRequested();
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));

  // Flush the 1 memtable (tables[4]) that was picked in to_flush3
  s = MemTableListTest::Mock_InstallMemtableFlushResults(
      &list, mutable_cf_options, to_flush3, &to_delete);
  ASSERT_OK(s);

  // This will install 0 tables, since tables[4] cannot be committed until
  // the older tables[3] has also been flushed.
  ASSERT_EQ(2, list.NumNotFlushed());
  ASSERT_EQ(0, to_delete.size());

  // Flush the 1 memtable (tables[3]) that was picked in to_flush2
  s = MemTableListTest::Mock_InstallMemtableFlushResults(
      &list, mutable_cf_options, to_flush2, &to_delete);
  ASSERT_OK(s);

  // This will actually install 2 tables: the 1 we told it to flush, and
  // also tables[4], which has been waiting for tables[3] to commit.
  ASSERT_EQ(0, list.NumNotFlushed());
  num_in_history =
      std::min(5, static_cast<int>(max_write_buffer_size_to_maintain) /
                      static_cast<int>(options.write_buffer_size));
  ASSERT_EQ(num_in_history, list.NumFlushed());
  ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());

  for (const auto& m : to_delete) {
    // Refcount should be 0 after calling TryInstallMemtableFlushResults.
    // Verify this by Ref'ing then Unref'ing:
    m->Ref();
    ASSERT_EQ(m, m->Unref());
    delete m;
  }
  to_delete.clear();

  // Add another table
  list.Add(tables[5], &to_delete);
  ASSERT_EQ(1, list.NumNotFlushed());
  ASSERT_EQ(5, list.GetLatestMemTableID(false /* for_atomic_flush */));
  memtable_id = 4;
  // Pick tables to flush. The tables picked must have an ID smaller than or
  // equal to 4. Therefore, no table will be selected in this case.
  autovector<ReadOnlyMemTable*> to_flush5;
  list.FlushRequested();
  ASSERT_TRUE(list.HasFlushRequested());
  list.PickMemtablesToFlush(memtable_id, &to_flush5);
  ASSERT_TRUE(to_flush5.empty());
  ASSERT_EQ(1, list.NumNotFlushed());
  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
  ASSERT_FALSE(list.IsFlushPending());
  ASSERT_FALSE(list.HasFlushRequested());
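
  // Note that even an unproductive pick clears the flush-requested state, so
  // a caller that wants memtables beyond the ID cutoff has to call
  // FlushRequested() again, as done below.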

  // Pick tables to flush. The tables picked must have an ID smaller than or
  // equal to 5. Therefore, only tables[5] will be selected.
  memtable_id = 5;
  list.FlushRequested();
  list.PickMemtablesToFlush(memtable_id, &to_flush5);
  ASSERT_EQ(1, static_cast<int>(to_flush5.size()));
  ASSERT_EQ(1, list.NumNotFlushed());
  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
  ASSERT_FALSE(list.IsFlushPending());
  to_delete.clear();
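
  // Dropping the last reference to the current MemTableListVersion hands
  // back every remaining memtable, flushed history included, via to_delete.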
  list.current()->Unref(&to_delete);
  int to_delete_size =
      std::min(num_tables, static_cast<int>(max_write_buffer_size_to_maintain) /
                               static_cast<int>(options.write_buffer_size));
  ASSERT_EQ(to_delete_size, to_delete.size());

  for (const auto& m : to_delete) {
    // Refcount should be 0 after calling TryInstallMemtableFlushResults.
    // Verify this by Ref'ing then Unref'ing:
    m->Ref();
    ASSERT_EQ(m, m->Unref());
    delete m;
  }
  to_delete.clear();
}

TEST_F(MemTableListTest, EmptyAtomicFlushTest) {
  autovector<MemTableList*> lists;
  autovector<uint32_t> cf_ids;
  autovector<const MutableCFOptions*> options_list;
  autovector<const autovector<ReadOnlyMemTable*>*> to_flush;
  autovector<ReadOnlyMemTable*> to_delete;
  Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list,
                                                    to_flush, &to_delete);
  ASSERT_OK(s);
  ASSERT_TRUE(to_delete.empty());
}

TEST_F(MemTableListTest, AtomicFlushTest) {
  const int num_cfs = 3;
  const int num_tables_per_cf = 2;
  SequenceNumber seq = 1;
|
|
|
|
|
|
|
|
auto factory = std::make_shared<SkipListFactory>();
|
|
|
|
options.memtable_factory = factory;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2018-10-16 02:59:20 +00:00
|
|
|
InternalKeyComparator cmp(BytewiseComparator());
|
|
|
|
WriteBufferManager wb(options.db_write_buffer_size);
|
|
|
|
|
|
|
|
// Create MemTableLists
|
|
|
|
int min_write_buffer_number_to_merge = 3;
|
|
|
|
int max_write_buffer_number_to_maintain = 7;
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and the mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually work as both an upper bound and a lower bound. History trimming will start if the number of immutable memtables exceeds the limit, but the count will never go below (limit-1) due to history trimming.
In order to mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable would cause the total memory usage to go below the size limit. For example, assuming the size limit is set to 64MB and there are 3 immutable memtables with sizes of 20MB, 30MB, and 30MB: although the total memory usage is 80MB > 64MB, dropping the oldest memtable would reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
int64_t max_write_buffer_size_to_maintain =
|
|
|
|
7 * static_cast<int64_t>(options.write_buffer_size);
|
2018-10-16 02:59:20 +00:00
|
|
|
autovector<MemTableList*> lists;
|
|
|
|
for (int i = 0; i != num_cfs; ++i) {
|
|
|
|
lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and the mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually work as both an upper bound and a lower bound. History trimming will start if the number of immutable memtables exceeds the limit, but the count will never go below (limit-1) due to history trimming.
In order to mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable would cause the total memory usage to go below the size limit. For example, assuming the size limit is set to 64MB and there are 3 immutable memtables with sizes of 20MB, 30MB, and 30MB: although the total memory usage is 80MB > 64MB, dropping the oldest memtable would reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
max_write_buffer_number_to_maintain,
|
|
|
|
max_write_buffer_size_to_maintain));
|
2018-10-16 02:59:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
autovector<uint32_t> cf_ids;
|
|
|
|
std::vector<std::vector<MemTable*>> tables(num_cfs);
|
|
|
|
autovector<const MutableCFOptions*> mutable_cf_options_list;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
for (auto& elem : tables) {
|
|
|
|
mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
|
|
|
|
uint64_t memtable_id = 0;
|
|
|
|
for (int i = 0; i != num_tables_per_cf; ++i) {
|
|
|
|
MemTable* mem =
|
|
|
|
new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
|
|
|
|
kMaxSequenceNumber, cf_id);
|
|
|
|
mem->SetID(memtable_id++);
|
|
|
|
mem->Ref();
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
|
2022-05-06 20:03:58 +00:00
|
|
|
ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
nullptr /* kv_prot_info */));
|
2022-05-06 20:03:58 +00:00
|
|
|
ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i),
|
|
|
|
"valueN", nullptr /* kv_prot_info */));
|
|
|
|
ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
nullptr /* kv_prot_info */));
|
2022-05-06 20:03:58 +00:00
|
|
|
ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i),
|
|
|
|
"valueM", nullptr /* kv_prot_info */));
|
|
|
|
ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
nullptr /* kv_prot_info */));
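      // Illustrative sketch of the ProtectionInfo idea from the change above,
      // not the real ProtectionInfo* API (kv_prot_info stays nullptr in this
      // test): each covered field is hashed with an independent seed and the
      // hashes are XOR'd into one integer, so coverage can later be swapped by
      // XOR'ing one field hash out and another in. std::hash stands in for the
      // seeded hash, and the timestamp field is omitted for brevity.
      auto field_hash = [](const std::string& field, uint64_t hash_seed) {
        return std::hash<std::string>{}(field) ^
               (hash_seed * 0x9e3779b97f4a7c15ULL);
      };
      const uint64_t kv_op_cf = field_hash("key1", 1) ^
                                field_hash("value", 2) ^
                                field_hash("kTypeValue", 3) ^
                                field_hash(std::to_string(cf_id), 4);
      // Moving to memtable coverage: XOR the CF id out and the seqno in.
      const uint64_t kv_op_seq = kv_op_cf ^
                                 field_hash(std::to_string(cf_id), 4) ^
                                 field_hash(std::to_string(seq), 5);
      (void)kv_op_seq;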
|
2018-10-16 02:59:20 +00:00
|
|
|
|
|
|
|
elem.push_back(mem);
|
|
|
|
}
|
|
|
|
cf_ids.push_back(cf_id++);
|
|
|
|
}
|
|
|
|
|
2024-11-05 00:09:34 +00:00
|
|
|
std::vector<autovector<ReadOnlyMemTable*>> flush_candidates(num_cfs);
|
2018-10-16 02:59:20 +00:00
|
|
|
|
|
|
|
// Nothing to flush
|
2019-01-04 04:53:52 +00:00
|
|
|
for (auto i = 0; i != num_cfs; ++i) {
|
|
|
|
auto* list = lists[i];
|
2018-10-16 02:59:20 +00:00
|
|
|
ASSERT_FALSE(list->IsFlushPending());
|
|
|
|
ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
|
2022-05-05 20:08:21 +00:00
|
|
|
list->PickMemtablesToFlush(
|
|
|
|
std::numeric_limits<uint64_t>::max() /* memtable_id */,
|
|
|
|
&flush_candidates[i]);
|
2019-01-04 04:53:52 +00:00
|
|
|
ASSERT_EQ(0, flush_candidates[i].size());
|
2018-10-16 02:59:20 +00:00
|
|
|
}
|
|
|
|
// Request flush even though there is nothing to flush
|
2019-01-04 04:53:52 +00:00
|
|
|
for (auto i = 0; i != num_cfs; ++i) {
|
|
|
|
auto* list = lists[i];
|
2018-10-16 02:59:20 +00:00
|
|
|
list->FlushRequested();
|
|
|
|
ASSERT_FALSE(list->IsFlushPending());
|
|
|
|
ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
|
|
|
|
}
|
2024-11-05 00:09:34 +00:00
|
|
|
autovector<ReadOnlyMemTable*> to_delete;
|
2019-01-04 04:53:52 +00:00
|
|
|
  // Add tables to the immutable memtable lists associated with column families
|
|
|
|
for (auto i = 0; i != num_cfs; ++i) {
|
|
|
|
for (auto j = 0; j != num_tables_per_cf; ++j) {
|
2018-10-16 02:59:20 +00:00
|
|
|
lists[i]->Add(tables[i][j], &to_delete);
|
|
|
|
}
|
|
|
|
ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
|
|
|
|
ASSERT_TRUE(lists[i]->IsFlushPending());
|
|
|
|
ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
|
|
|
|
}
|
2019-01-04 04:53:52 +00:00
|
|
|
std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
|
|
|
|
// +----+
|
|
|
|
// list[0]: |0 1|
|
|
|
|
// list[1]: |0 1|
|
|
|
|
// | +--+
|
|
|
|
// list[2]: |0| 1
|
|
|
|
// +-+
|
|
|
|
// Pick memtables to flush
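  // Each list picks every not-yet-flushed memtable whose ID is at most
  // flush_memtable_ids[i], as the diagram above shows.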
|
|
|
|
for (auto i = 0; i != num_cfs; ++i) {
|
|
|
|
flush_candidates[i].clear();
|
2020-12-02 17:29:50 +00:00
|
|
|
lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]);
|
2019-01-04 04:53:52 +00:00
|
|
|
ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
|
|
|
|
static_cast<uint64_t>(flush_candidates[i].size()));
|
|
|
|
}
|
|
|
|
autovector<MemTableList*> tmp_lists;
|
|
|
|
autovector<uint32_t> tmp_cf_ids;
|
|
|
|
autovector<const MutableCFOptions*> tmp_options_list;
|
2024-11-05 00:09:34 +00:00
|
|
|
autovector<const autovector<ReadOnlyMemTable*>*> to_flush;
|
2019-01-04 04:53:52 +00:00
|
|
|
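  // Only column families with at least one picked memtable participate in
  // the atomic flush install below.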
for (auto i = 0; i != num_cfs; ++i) {
|
|
|
|
if (!flush_candidates[i].empty()) {
|
|
|
|
to_flush.push_back(&flush_candidates[i]);
|
|
|
|
tmp_lists.push_back(lists[i]);
|
|
|
|
tmp_cf_ids.push_back(i);
|
|
|
|
tmp_options_list.push_back(mutable_cf_options_list[i]);
|
2018-10-16 02:59:20 +00:00
|
|
|
}
|
2019-01-04 04:53:52 +00:00
|
|
|
}
|
|
|
|
Status s = Mock_InstallMemtableAtomicFlushResults(
|
|
|
|
tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
|
|
|
|
ASSERT_OK(s);
|
2018-10-16 02:59:20 +00:00
|
|
|
|
2019-01-04 04:53:52 +00:00
|
|
|
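  // Memtables with IDs at or below the picked maximum must have been flushed
  // and assigned an SST file number; the rest stay in the not-flushed list.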
for (auto i = 0; i != num_cfs; ++i) {
|
|
|
|
for (auto j = 0; j != num_tables_per_cf; ++j) {
|
|
|
|
if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
|
|
|
|
ASSERT_LT(0, tables[i][j]->GetFileNumber());
|
2018-10-16 02:59:20 +00:00
|
|
|
}
|
|
|
|
}
|
2019-01-04 04:53:52 +00:00
|
|
|
ASSERT_EQ(
|
|
|
|
static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
|
|
|
|
lists[i]->NumNotFlushed());
|
2018-10-16 02:59:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
to_delete.clear();
|
|
|
|
for (auto list : lists) {
|
|
|
|
list->current()->Unref(&to_delete);
|
|
|
|
delete list;
|
|
|
|
}
|
|
|
|
for (auto& mutable_cf_options : mutable_cf_options_list) {
|
|
|
|
if (mutable_cf_options != nullptr) {
|
|
|
|
delete mutable_cf_options;
|
|
|
|
mutable_cf_options = nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// All memtables in tables array must have been flushed, thus ready to be
|
|
|
|
// deleted.
|
|
|
|
ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size());
|
|
|
|
for (const auto& m : to_delete) {
|
|
|
|
// Refcount should be 0 after calling InstallMemtableFlushResults.
|
|
|
|
// Verify this by Ref'ing and then Unref'ing.
|
|
|
|
m->Ref();
|
|
|
|
ASSERT_EQ(m, m->Unref());
|
|
|
|
delete m;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-07-26 23:25:06 +00:00
|
|
|
class MemTableListWithTimestampTest : public MemTableListTest {
|
|
|
|
public:
|
|
|
|
MemTableListWithTimestampTest() : MemTableListTest() {}
|
|
|
|
|
|
|
|
void SetUp() override { udt_enabled_ = true; }
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(MemTableListWithTimestampTest, GetTableNewestUDT) {
|
|
|
|
const int num_tables = 3;
|
|
|
|
const int num_entries = 5;
|
|
|
|
SequenceNumber seq = 1;
|
|
|
|
|
|
|
|
auto factory = std::make_shared<SkipListFactory>();
|
|
|
|
options.memtable_factory = factory;
|
|
|
|
options.persist_user_defined_timestamps = false;
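  // When UDTs are not persisted, each memtable tracks the newest UDT it has
  // seen; GetTablesNewestUDT() below relies on that tracking.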
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
const Comparator* ucmp = test::BytewiseComparatorWithU64TsWrapper();
|
|
|
|
InternalKeyComparator cmp(ucmp);
|
|
|
|
WriteBufferManager wb(options.db_write_buffer_size);
|
|
|
|
|
|
|
|
// Create MemTableList
|
|
|
|
int min_write_buffer_number_to_merge = 1;
|
|
|
|
int max_write_buffer_number_to_maintain = 4;
|
|
|
|
int64_t max_write_buffer_size_to_maintain =
|
|
|
|
      4 * static_cast<int64_t>(options.write_buffer_size);
|
|
|
|
MemTableList list(min_write_buffer_number_to_merge,
|
|
|
|
max_write_buffer_number_to_maintain,
|
|
|
|
max_write_buffer_size_to_maintain);
|
|
|
|
|
|
|
|
// Create some MemTables
|
|
|
|
uint64_t memtable_id = 0;
|
|
|
|
std::vector<MemTable*> tables;
|
|
|
|
MutableCFOptions mutable_cf_options(options);
|
|
|
|
uint64_t current_ts = 0;
|
2024-11-05 00:09:34 +00:00
|
|
|
autovector<ReadOnlyMemTable*> to_delete;
|
2023-07-26 23:25:06 +00:00
|
|
|
std::vector<std::string> newest_udts;
|
|
|
|
|
|
|
|
std::string key;
|
|
|
|
std::string write_ts;
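  // Each memtable gets num_entries versions of "key1", each suffixed with an
  // increasing fixed-width 64-bit timestamp; after the inner loop, write_ts
  // holds that memtable's newest UDT.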
|
|
|
|
for (int i = 0; i < num_tables; i++) {
|
|
|
|
MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb,
|
|
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
|
|
|
mem->SetID(memtable_id++);
|
|
|
|
mem->Ref();
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
MergeContext merge_context;
|
|
|
|
|
|
|
|
for (int j = 0; j < num_entries; j++) {
|
|
|
|
key = "key1";
|
|
|
|
write_ts.clear();
|
|
|
|
PutFixed64(&write_ts, current_ts);
|
|
|
|
key.append(write_ts);
|
|
|
|
ASSERT_OK(mem->Add(++seq, kTypeValue, key, std::to_string(i),
|
|
|
|
nullptr /* kv_prot_info */));
|
|
|
|
current_ts++;
|
|
|
|
}
|
|
|
|
|
|
|
|
tables.push_back(mem);
|
|
|
|
list.Add(tables.back(), &to_delete);
|
|
|
|
newest_udts.push_back(write_ts);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(num_tables, list.NumNotFlushed());
|
|
|
|
ASSERT_TRUE(list.IsFlushPending());
|
|
|
|
std::vector<Slice> tables_newest_udts = list.GetTablesNewestUDT(num_tables);
|
|
|
|
ASSERT_EQ(newest_udts.size(), tables_newest_udts.size());
|
|
|
|
for (size_t i = 0; i < tables_newest_udts.size(); i++) {
|
|
|
|
const Slice& table_newest_udt = tables_newest_udts[i];
|
|
|
|
const Slice expected_newest_udt = newest_udts[i];
|
|
|
|
ASSERT_EQ(expected_newest_udt, table_newest_udt);
|
|
|
|
}
|
|
|
|
|
|
|
|
list.current()->Unref(&to_delete);
|
2024-11-05 00:09:34 +00:00
|
|
|
for (ReadOnlyMemTable* m : to_delete) {
|
2023-07-26 23:25:06 +00:00
|
|
|
delete m;
|
|
|
|
}
|
|
|
|
to_delete.clear();
|
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2015-04-10 01:01:11 +00:00
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2022-10-18 07:35:35 +00:00
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
2015-04-10 01:01:11 +00:00
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|