// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <memory>

#include "db/column_family.h"
#include "db/db_test_util.h"
#include "db/memtable.h"
#include "db/wide/wide_columns_helper.h"
#include "db/write_batch_internal.h"
#include "dbformat.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/memtablerep.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "rocksdb/write_buffer_manager.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/string_util.h"

namespace ROCKSDB_NAMESPACE {
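
// Applies the batch to a fresh MemTable and returns a string dump of the
// resulting entries and range tombstones, e.g. "Put(foo, bar)@100". Also
// cross-checks the batch's content flags (HasPut() etc.) against what was
// actually inserted, and appends "CountMismatch()" if the entry count
// disagrees with WriteBatchInternal::Count().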
static std::string PrintContents(WriteBatch* b,
                                 bool merge_operator_supported = true) {
  InternalKeyComparator cmp(BytewiseComparator());
  auto factory = std::make_shared<SkipListFactory>();
  Options options;
  options.memtable_factory = factory;
  if (merge_operator_supported) {
    options.merge_operator.reset(new TestPutOperator());
  }
  ImmutableOptions ioptions(options);
  WriteBufferManager wb(options.db_write_buffer_size);
  MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
                               kMaxSequenceNumber, 0 /* column_family_id */);
  mem->Ref();
  std::string state;
  ColumnFamilyMemTablesDefault cf_mems_default(mem);
  Status s =
      WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr);
  uint32_t count = 0;
  int put_count = 0;
  int timed_put_count = 0;
  int delete_count = 0;
  int single_delete_count = 0;
  int delete_range_count = 0;
  int merge_count = 0;
  for (int i = 0; i < 2; ++i) {
    Arena arena;
    ScopedArenaPtr<InternalIterator> arena_iter_guard;
    std::unique_ptr<InternalIterator> iter_guard;
    InternalIterator* iter;
    if (i == 0) {
      iter = mem->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
                              &arena, /*prefix_extractor=*/nullptr);
      arena_iter_guard.reset(iter);
    } else {
      iter = mem->NewRangeTombstoneIterator(ReadOptions(),
                                            kMaxSequenceNumber /* read_seq */,
                                            false /* immutable_memtable */);
      iter_guard.reset(iter);
    }
    if (iter == nullptr) {
      continue;
    }
    EXPECT_OK(iter->status());
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      ParsedInternalKey ikey;
      ikey.clear();
      EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
      switch (ikey.type) {
        case kTypeValue:
          state.append("Put(");
          state.append(ikey.user_key.ToString());
          state.append(", ");
          state.append(iter->value().ToString());
          state.append(")");
          count++;
          put_count++;
          break;
        case kTypeDeletion:
          state.append("Delete(");
          state.append(ikey.user_key.ToString());
          state.append(")");
          count++;
          delete_count++;
          break;
        case kTypeSingleDeletion:
          state.append("SingleDelete(");
          state.append(ikey.user_key.ToString());
          state.append(")");
          count++;
          single_delete_count++;
          break;
        case kTypeRangeDeletion:
          state.append("DeleteRange(");
          state.append(ikey.user_key.ToString());
          state.append(", ");
          state.append(iter->value().ToString());
          state.append(")");
          count++;
          delete_range_count++;
          break;
        case kTypeMerge:
          state.append("Merge(");
          state.append(ikey.user_key.ToString());
          state.append(", ");
          state.append(iter->value().ToString());
          state.append(")");
          count++;
          merge_count++;
          break;
        case kTypeValuePreferredSeqno: {
          state.append("TimedPut(");
          state.append(ikey.user_key.ToString());
          state.append(", ");
          auto [unpacked_value, unix_write_time] =
              ParsePackedValueWithWriteTime(iter->value());
          state.append(unpacked_value.ToString());
          state.append(", ");
          state.append(std::to_string(unix_write_time));
          state.append(")");
          count++;
          timed_put_count++;
          break;
        }
        default:
          assert(false);
          break;
      }
      state.append("@");
      state.append(std::to_string(ikey.sequence));
    }
    EXPECT_OK(iter->status());
  }
  if (s.ok()) {
    EXPECT_EQ(b->HasPut(), put_count > 0);
    EXPECT_EQ(b->HasTimedPut(), timed_put_count > 0);
    EXPECT_EQ(b->HasDelete(), delete_count > 0);
    EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0);
    EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0);
    EXPECT_EQ(b->HasMerge(), merge_count > 0);
    if (count != WriteBatchInternal::Count(b)) {
      state.append("CountMismatch()");
    }
  } else {
    state.append(s.ToString());
  }
  delete mem->Unref();
  return state;
}

class WriteBatchTest : public testing::Test {};
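
// An empty batch prints nothing and reports a count of zero.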
TEST_F(WriteBatchTest, Empty) {
  WriteBatch batch;
  ASSERT_EQ("", PrintContents(&batch));
  ASSERT_EQ(0u, WriteBatchInternal::Count(&batch));
  ASSERT_EQ(0u, batch.Count());
}
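
// Operations are numbered consecutively from the batch's sequence number in
// insertion order; the memtable dump then orders point entries by user key,
// with range tombstones printed last from the separate tombstone iterator.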
TEST_F(WriteBatchTest, Multiple) {
  WriteBatch batch;
  ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
  ASSERT_OK(batch.Delete(Slice("box")));
  ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo")));
  ASSERT_OK(batch.Put(Slice("baz"), Slice("boo")));
  WriteBatchInternal::SetSequence(&batch, 100);
  ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
  ASSERT_EQ(4u, WriteBatchInternal::Count(&batch));
  ASSERT_EQ(
      "Put(baz, boo)@103"
      "Delete(box)@101"
      "Put(foo, bar)@100"
      "DeleteRange(bar, foo)@102",
      PrintContents(&batch));
  ASSERT_EQ(4u, batch.Count());
}
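
// Truncating the serialized batch by one byte should surface a Corruption
// status once replay reaches the damaged record, while earlier records are
// still applied.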
TEST_F(WriteBatchTest, Corruption) {
  WriteBatch batch;
  ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
  ASSERT_OK(batch.Delete(Slice("box")));
  WriteBatchInternal::SetSequence(&batch, 200);
  Slice contents = WriteBatchInternal::Contents(&batch);
  ASSERT_OK(WriteBatchInternal::SetContents(
      &batch, Slice(contents.data(), contents.size() - 1)));
  ASSERT_EQ(
      "Put(foo, bar)@200"
      "Corruption: bad WriteBatch Delete",
      PrintContents(&batch));
}
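
// Appending one batch to another renumbers the appended entries relative to
// the destination's sequence. With /*wal only*/ true, operations added after
// MarkWalTerminationPoint() on the source batch are not appended.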
TEST_F(WriteBatchTest, Append) {
  WriteBatch b1, b2;
  WriteBatchInternal::SetSequence(&b1, 200);
  WriteBatchInternal::SetSequence(&b2, 300);
  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
  ASSERT_EQ("", PrintContents(&b1));
  ASSERT_EQ(0u, b1.Count());
  ASSERT_OK(b2.Put("a", "va"));
  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
  ASSERT_EQ("Put(a, va)@200", PrintContents(&b1));
  ASSERT_EQ(1u, b1.Count());
  b2.Clear();
  ASSERT_OK(b2.Put("b", "vb"));
  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
  ASSERT_EQ(
      "Put(a, va)@200"
      "Put(b, vb)@201",
      PrintContents(&b1));
  ASSERT_EQ(2u, b1.Count());
  ASSERT_OK(b2.Delete("foo"));
  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
  ASSERT_EQ(
      "Put(a, va)@200"
      "Put(b, vb)@202"
      "Put(b, vb)@201"
      "Delete(foo)@203",
      PrintContents(&b1));
  ASSERT_EQ(4u, b1.Count());
  b2.Clear();
  ASSERT_OK(b2.Put("c", "cc"));
  ASSERT_OK(b2.Put("d", "dd"));
  b2.MarkWalTerminationPoint();
  ASSERT_OK(b2.Put("e", "ee"));
  ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true));
  ASSERT_EQ(
      "Put(a, va)@200"
      "Put(b, vb)@202"
      "Put(b, vb)@201"
      "Put(c, cc)@204"
      "Put(d, dd)@205"
      "Delete(foo)@203",
      PrintContents(&b1));
  ASSERT_EQ(6u, b1.Count());
  ASSERT_EQ(
      "Put(c, cc)@0"
      "Put(d, dd)@1"
      "Put(e, ee)@2",
      PrintContents(&b2));
  ASSERT_EQ(3u, b2.Count());
}
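
// SingleDelete adds a kTypeSingleDeletion entry alongside (not replacing)
// the earlier Put for the same key, and counts as a separate operation.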
TEST_F(WriteBatchTest, SingleDeletion) {
  WriteBatch batch;
  WriteBatchInternal::SetSequence(&batch, 100);
  ASSERT_EQ("", PrintContents(&batch));
  ASSERT_EQ(0u, batch.Count());
  ASSERT_OK(batch.Put("a", "va"));
  ASSERT_EQ("Put(a, va)@100", PrintContents(&batch));
  ASSERT_EQ(1u, batch.Count());
  ASSERT_OK(batch.SingleDelete("a"));
  ASSERT_EQ(
      "SingleDelete(a)@101"
      "Put(a, va)@100",
      PrintContents(&batch));
  ASSERT_EQ(2u, batch.Count());
}
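
// Release() and the string-move constructor should hand off the underlying
// buffer without copying, so the data pointer stays stable across transfers.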
TEST_F(WriteBatchTest, OwnershipTransfer) {
  Random rnd(301);
  WriteBatch put_batch;
  ASSERT_OK(put_batch.Put(rnd.RandomString(16) /* key */,
                          rnd.RandomString(1024) /* value */));

  // (1) Verify `Release()` transfers string data ownership
  const char* expected_data = put_batch.Data().data();
  std::string batch_str = put_batch.Release();
  ASSERT_EQ(expected_data, batch_str.data());

  // (2) Verify constructor transfers string data ownership
  WriteBatch move_batch(std::move(batch_str));
  ASSERT_EQ(expected_data, move_batch.Data().data());
}
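
// A WriteBatch::Handler that records every callback it receives in `seen`,
// using a short form for the default column family (id 0) and a *CF form
// that includes the column family id otherwise.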
namespace {
struct TestHandler : public WriteBatch::Handler {
  std::string seen;
  Status PutCF(uint32_t column_family_id, const Slice& key,
               const Slice& value) override {
    if (column_family_id == 0) {
      seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
    } else {
      seen += "PutCF(" + std::to_string(column_family_id) + ", " +
              key.ToString() + ", " + value.ToString() + ")";
    }
    return Status::OK();
  }
  Status TimedPutCF(uint32_t column_family_id, const Slice& key,
                    const Slice& value, uint64_t unix_write_time) override {
    if (column_family_id == 0) {
      seen += "TimedPut(" + key.ToString() + ", " + value.ToString() + ", " +
              std::to_string(unix_write_time) + ")";
    } else {
      seen += "TimedPutCF(" + std::to_string(column_family_id) + ", " +
              key.ToString() + ", " + value.ToString() + ", " +
              std::to_string(unix_write_time) + ")";
    }
    return Status::OK();
  }
  Status PutEntityCF(uint32_t column_family_id, const Slice& key,
                     const Slice& entity) override {
    std::ostringstream oss;
    Status s = WideColumnsHelper::DumpSliceAsWideColumns(entity, oss, false);
    if (!s.ok()) {
      return s;
    }
    if (column_family_id == 0) {
      seen += "PutEntity(" + key.ToString() + ", " + oss.str() + ")";
    } else {
      seen += "PutEntityCF(" + std::to_string(column_family_id) + ", " +
              key.ToString() + ", " + oss.str() + ")";
    }
    return Status::OK();
  }
  Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
    if (column_family_id == 0) {
      seen += "Delete(" + key.ToString() + ")";
    } else {
      seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
              key.ToString() + ")";
    }
    return Status::OK();
  }
  Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
    if (column_family_id == 0) {
      seen += "SingleDelete(" + key.ToString() + ")";
    } else {
      seen += "SingleDeleteCF(" + std::to_string(column_family_id) + ", " +
              key.ToString() + ")";
    }
    return Status::OK();
  }
  Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
                       const Slice& end_key) override {
    if (column_family_id == 0) {
      seen += "DeleteRange(" + begin_key.ToString() + ", " +
              end_key.ToString() + ")";
    } else {
      seen += "DeleteRangeCF(" + std::to_string(column_family_id) + ", " +
              begin_key.ToString() + ", " + end_key.ToString() + ")";
    }
    return Status::OK();
  }
  Status MergeCF(uint32_t column_family_id, const Slice& key,
                 const Slice& value) override {
    if (column_family_id == 0) {
      seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
    } else {
      seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
              key.ToString() + ", " + value.ToString() + ")";
    }
    return Status::OK();
  }
  void LogData(const Slice& blob) override {
    seen += "LogData(" + blob.ToString() + ")";
  }
  Status MarkBeginPrepare(bool unprepare) override {
    seen +=
        "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")";
    return Status::OK();
  }
  Status MarkEndPrepare(const Slice& xid) override {
    seen += "MarkEndPrepare(" + xid.ToString() + ")";
    return Status::OK();
  }
  Status MarkNoop(bool empty_batch) override {
    seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")";
    return Status::OK();
  }
  Status MarkCommit(const Slice& xid) override {
    seen += "MarkCommit(" + xid.ToString() + ")";
    return Status::OK();
  }
  Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override {
    seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " +
            ts.ToString(true) + ")";
    return Status::OK();
  }
  Status MarkRollback(const Slice& xid) override {
    seen += "MarkRollback(" + xid.ToString() + ")";
    return Status::OK();
  }
};
}  // anonymous namespace
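
// The base Handler has a working default for Put, so iterating a batch of
// Puts with an unmodified handler succeeds.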
TEST_F(WriteBatchTest, PutNotImplemented) {
  WriteBatch batch;
  ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
  ASSERT_EQ(1u, batch.Count());
  ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch));

  WriteBatch::Handler handler;
  ASSERT_OK(batch.Iterate(&handler));
}
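
// TimedPut has no default Handler implementation, so iteration fails with
// InvalidArgument. A TimedPut whose write time is
// std::numeric_limits<uint64_t>::max() degrades to a plain Put.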
TEST_F(WriteBatchTest, TimedPutNotImplemented) {
  WriteBatch batch;
  ASSERT_OK(
      batch.TimedPut(0, Slice("k1"), Slice("v1"), /*write_unix_time=*/30));
  ASSERT_EQ(1u, batch.Count());
  ASSERT_EQ("TimedPut(k1, v1, 30)@0", PrintContents(&batch));

  WriteBatch::Handler handler;
  ASSERT_TRUE(batch.Iterate(&handler).IsInvalidArgument());

  batch.Clear();
  ASSERT_OK(
      batch.TimedPut(0, Slice("k1"), Slice("v1"),
                     /*write_unix_time=*/std::numeric_limits<uint64_t>::max()));
  ASSERT_EQ(1u, batch.Count());
  ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch));
}
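
// Delete, like Put, has a working default Handler implementation.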
TEST_F(WriteBatchTest, DeleteNotImplemented) {
  WriteBatch batch;
  ASSERT_OK(batch.Delete(Slice("k2")));
  ASSERT_EQ(1u, batch.Count());
  ASSERT_EQ("Delete(k2)@0", PrintContents(&batch));

  WriteBatch::Handler handler;
  ASSERT_OK(batch.Iterate(&handler));
}

TEST_F(WriteBatchTest, SingleDeleteNotImplemented) {
  WriteBatch batch;
  ASSERT_OK(batch.SingleDelete(Slice("k2")));
  ASSERT_EQ(1u, batch.Count());
  ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch));

  WriteBatch::Handler handler;
  ASSERT_OK(batch.Iterate(&handler));
}

TEST_F(WriteBatchTest, MergeNotImplemented) {
  WriteBatch batch;
  ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
  ASSERT_EQ(1u, batch.Count());
  ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch));

  WriteBatch::Handler handler;
  ASSERT_OK(batch.Iterate(&handler));
}
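
// Inserting a Merge into a memtable whose options lack a merge operator is
// expected to fail; PrintContents surfaces that insertion status as its
// output.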
TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) {
  WriteBatch batch;
  ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
  ASSERT_EQ(1u, batch.Count());
  ASSERT_EQ(
      "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator "
      "!= nullptr`",
      PrintContents(&batch, false /* merge_operator_supported */));
}
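
// PutLogData entries are WAL-only metadata: eight operations are issued
// below, but the two blobs do not count toward Count() and never reach the
// memtable, so they show up in the handler's view yet not in PrintContents().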
TEST_F(WriteBatchTest, Blob) {
  WriteBatch batch;
  ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
  ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
  ASSERT_OK(batch.Put(Slice("k3"), Slice("v3")));
  ASSERT_OK(batch.PutLogData(Slice("blob1")));
  ASSERT_OK(batch.Delete(Slice("k2")));
  ASSERT_OK(batch.SingleDelete(Slice("k3")));
  ASSERT_OK(batch.PutLogData(Slice("blob2")));
  ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
  ASSERT_EQ(6u, batch.Count());
  ASSERT_EQ(
      "Merge(foo, bar)@5"
      "Put(k1, v1)@0"
      "Delete(k2)@3"
      "Put(k2, v2)@1"
      "SingleDelete(k3)@4"
      "Put(k3, v3)@2",
      PrintContents(&batch));

  TestHandler handler;
  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(
      "Put(k1, v1)"
      "Put(k2, v2)"
      "Put(k3, v3)"
      "LogData(blob1)"
      "Delete(k2)"
      "SingleDelete(k3)"
      "LogData(blob2)"
      "Merge(foo, bar)",
      handler.seen);
}
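
// Two-phase-commit markers behave like PutLogData: they are serialized into
// the batch but not counted, so only the two Puts contribute to Count().
// Sealing the prepared section with MarkEndPrepare also appears to retire
// the earlier savepoint, hence the NotFound from RollbackToSavePoint.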
TEST_F(WriteBatchTest, PrepareCommit) {
  WriteBatch batch;
  ASSERT_OK(WriteBatchInternal::InsertNoop(&batch));
  ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
  ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
  batch.SetSavePoint();
  ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")));
  Status s = batch.RollbackToSavePoint();
  ASSERT_EQ(s, Status::NotFound());
  ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1")));
  ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1")));
  ASSERT_EQ(2u, batch.Count());

  TestHandler handler;
  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(
      "MarkBeginPrepare(false)"
      "Put(k1, v1)"
      "Put(k2, v2)"
      "MarkEndPrepare(xid1)"
      "MarkCommit(xid1)"
      "MarkRollback(xid1)",
      handler.seen);
}

// This test requires more than 30GB of memory, with a single memory
// allocation of more than 30GB. Not all platforms can run it, and it also
// takes a long time, so it is disabled.
TEST_F(WriteBatchTest, DISABLED_ManyUpdates) {
  // Insert about 3 billion tiny key/value pairs, pushing the total batch
  // size past 30GB.
  static const size_t kKeyValueSize = 4u;
  static const uint32_t kNumUpdates = uint32_t{3} << 30;
  std::string raw(kKeyValueSize, 'A');
  WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u);
  char c = 'A';
  for (uint32_t i = 0; i < kNumUpdates; i++) {
    if (c > 'Z') {
      c = 'A';
    }
    raw[0] = c;
    raw[raw.length() - 1] = c;
    c++;
    ASSERT_OK(batch.Put(raw, raw));
  }

  ASSERT_EQ(kNumUpdates, batch.Count());
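
  // A handler that only expects Puts: any other callback marks the giant
  // batch as corrupted, and Continue() stops iteration once every update
  // has been seen.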
  struct NoopHandler : public WriteBatch::Handler {
    uint32_t num_seen = 0;
    char expected_char = 'A';
    Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
                 const Slice& value) override {
      EXPECT_EQ(kKeyValueSize, key.size());
      EXPECT_EQ(kKeyValueSize, value.size());
      EXPECT_EQ(expected_char, key[0]);
      EXPECT_EQ(expected_char, value[0]);
      EXPECT_EQ(expected_char, key[kKeyValueSize - 1]);
      EXPECT_EQ(expected_char, value[kKeyValueSize - 1]);
      expected_char++;
      if (expected_char > 'Z') {
        expected_char = 'A';
      }
      ++num_seen;
      return Status::OK();
    }
    Status DeleteCF(uint32_t /*column_family_id*/,
                    const Slice& /*key*/) override {
      ADD_FAILURE();
      return Status::OK();
    }
    Status SingleDeleteCF(uint32_t /*column_family_id*/,
                          const Slice& /*key*/) override {
      ADD_FAILURE();
      return Status::OK();
    }
    Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
                   const Slice& /*value*/) override {
      ADD_FAILURE();
      return Status::OK();
    }
    void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
    bool Continue() override { return num_seen < kNumUpdates; }
  } handler;

  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(kNumUpdates, handler.num_seen);
}

// This test requires more than 18GB of memory to run, with a single memory
// allocation of more than 12GB. Not all platforms can run it, so it is
// disabled.
TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) {
  // Insert a 3GB key and a 3GB value twice, pushing the total batch size to
  // 12GB.
  static const size_t kKeyValueSize = 3221225472u;
  std::string raw(kKeyValueSize, 'A');
  WriteBatch batch(size_t(12884901888ull + 1024u));
  for (char i = 0; i < 2; i++) {
    raw[0] = 'A' + i;
    raw[raw.length() - 1] = 'A' - i;
    ASSERT_OK(batch.Put(raw, raw));
  }

  ASSERT_EQ(2u, batch.Count());

  struct NoopHandler : public WriteBatch::Handler {
    int num_seen = 0;
    Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
                 const Slice& value) override {
      EXPECT_EQ(kKeyValueSize, key.size());
      EXPECT_EQ(kKeyValueSize, value.size());
      EXPECT_EQ('A' + num_seen, key[0]);
      EXPECT_EQ('A' + num_seen, value[0]);
      EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]);
      EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]);
      ++num_seen;
      return Status::OK();
    }
    Status DeleteCF(uint32_t /*column_family_id*/,
                    const Slice& /*key*/) override {
      ADD_FAILURE();
      return Status::OK();
    }
    Status SingleDeleteCF(uint32_t /*column_family_id*/,
                          const Slice& /*key*/) override {
      ADD_FAILURE();
      return Status::OK();
    }
    Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
                   const Slice& /*value*/) override {
      ADD_FAILURE();
      return Status::OK();
    }
    void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
    bool Continue() override { return num_seen < 2; }
  } handler;

  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(2, handler.num_seen);
}
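
// Continue() is consulted between callbacks; returning false after five
// operations truncates iteration, so the second blob and the Merge below
// are never delivered to the handler.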
TEST_F(WriteBatchTest, Continue) {
  WriteBatch batch;

  struct Handler : public TestHandler {
    int num_seen = 0;
    Status PutCF(uint32_t column_family_id, const Slice& key,
                 const Slice& value) override {
      ++num_seen;
      return TestHandler::PutCF(column_family_id, key, value);
    }
    Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
      ++num_seen;
      return TestHandler::DeleteCF(column_family_id, key);
    }
    Status SingleDeleteCF(uint32_t column_family_id,
                          const Slice& key) override {
      ++num_seen;
      return TestHandler::SingleDeleteCF(column_family_id, key);
    }
    Status MergeCF(uint32_t column_family_id, const Slice& key,
                   const Slice& value) override {
      ++num_seen;
      return TestHandler::MergeCF(column_family_id, key, value);
    }
    void LogData(const Slice& blob) override {
      ++num_seen;
      TestHandler::LogData(blob);
    }
    bool Continue() override { return num_seen < 5; }
  } handler;

  ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
  ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
  ASSERT_OK(batch.PutLogData(Slice("blob1")));
  ASSERT_OK(batch.Delete(Slice("k1")));
  ASSERT_OK(batch.SingleDelete(Slice("k2")));
  ASSERT_OK(batch.PutLogData(Slice("blob2")));
  ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(
      "Put(k1, v1)"
      "Put(k2, v2)"
      "LogData(blob1)"
      "Delete(k1)"
      "SingleDelete(k2)",
      handler.seen);
}
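
// SliceParts lets a Put gather a key or value from multiple fragments; the
// fragments are concatenated when the entry is encoded, as the expected
// contents below show.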
TEST_F(WriteBatchTest, PutGatherSlices) {
  WriteBatch batch;
  ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));

  {
    // Try a write where the key is one slice but the value is two
    Slice key_slice("baz");
    Slice value_slices[2] = {Slice("header"), Slice("payload")};
    ASSERT_OK(
        batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2)));
  }

  {
    // One where the key is composite but the value is a single slice
    Slice key_slices[3] = {Slice("key"), Slice("part2"), Slice("part3")};
    Slice value_slice("value");
    ASSERT_OK(
        batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1)));
  }

  WriteBatchInternal::SetSequence(&batch, 100);
  ASSERT_EQ(
      "Put(baz, headerpayload)@101"
      "Put(foo, bar)@100"
      "Put(keypart2part3, value)@102",
      PrintContents(&batch));
  ASSERT_EQ(3u, batch.Count());
}
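
// A minimal stand-in for a real column family handle: the batch tests below
// only need it to report an ID and, optionally, a user comparator.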
namespace {
class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
 public:
  explicit ColumnFamilyHandleImplDummy(int id)
      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
  explicit ColumnFamilyHandleImplDummy(int id, const Comparator* ucmp)
      : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
        id_(id),
        ucmp_(ucmp) {}
  uint32_t GetID() const override { return id_; }
  const Comparator* GetComparator() const override { return ucmp_; }

 private:
  uint32_t id_;
  const Comparator* const ucmp_ = BytewiseComparator();
};
}  // anonymous namespace
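
// A single attribute-group PutEntity targets several column families at
// once; each group is expected to surface as its own PutEntity/PutEntityCF
// record during iteration.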
TEST_F(WriteBatchTest, AttributeGroupTest) {
  WriteBatch batch;
  ColumnFamilyHandleImplDummy zero(0), two(2);
  AttributeGroups foo_ags;
  WideColumn zero_col_1{"0_c_1_n", "0_c_1_v"};
  WideColumn zero_col_2{"0_c_2_n", "0_c_2_v"};
  WideColumns zero_col_1_col_2{zero_col_1, zero_col_2};

  WideColumn two_col_1{"2_c_1_n", "2_c_1_v"};
  WideColumn two_col_2{"2_c_2_n", "2_c_2_v"};
  WideColumns two_col_1_col_2{two_col_1, two_col_2};

  foo_ags.emplace_back(&zero, zero_col_1_col_2);
  foo_ags.emplace_back(&two, two_col_1_col_2);

  ASSERT_OK(batch.PutEntity("foo", foo_ags));

  TestHandler handler;
  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(
      "PutEntity(foo, 0_c_1_n:0_c_1_v "
      "0_c_2_n:0_c_2_v)"
      "PutEntityCF(2, foo, 2_c_1_n:2_c_1_v "
      "2_c_2_n:2_c_2_v)",
      handler.seen);
}
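
// Rolling back to the savepoint is expected to drop only the "bar" entity
// written after it, leaving the earlier "foo" entity intact.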
TEST_F(WriteBatchTest, AttributeGroupSavePointTest) {
  WriteBatch batch;
  batch.SetSavePoint();

  ColumnFamilyHandleImplDummy zero(0), two(2), three(3);
  AttributeGroups foo_ags;
  WideColumn zero_col_1{"0_c_1_n", "0_c_1_v"};
  WideColumn zero_col_2{"0_c_2_n", "0_c_2_v"};
  WideColumns zero_col_1_col_2{zero_col_1, zero_col_2};

  WideColumn two_col_1{"2_c_1_n", "2_c_1_v"};
  WideColumn two_col_2{"2_c_2_n", "2_c_2_v"};
  WideColumns two_col_1_col_2{two_col_1, two_col_2};

  foo_ags.emplace_back(&zero, zero_col_1_col_2);
  foo_ags.emplace_back(&two, two_col_1_col_2);

  AttributeGroups bar_ags;
  WideColumn three_col_1{"3_c_1_n", "3_c_1_v"};
  WideColumn three_col_2{"3_c_2_n", "3_c_2_v"};
  WideColumns three_col_1_col_2{three_col_1, three_col_2};

  bar_ags.emplace_back(&zero, zero_col_1_col_2);
  bar_ags.emplace_back(&three, three_col_1_col_2);

  ASSERT_OK(batch.PutEntity("foo", foo_ags));
  batch.SetSavePoint();

  ASSERT_OK(batch.PutEntity("bar", bar_ags));

  TestHandler handler;
  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(
      "PutEntity(foo, 0_c_1_n:0_c_1_v 0_c_2_n:0_c_2_v)"
      "PutEntityCF(2, foo, 2_c_1_n:2_c_1_v 2_c_2_n:2_c_2_v)"
      "PutEntity(bar, 0_c_1_n:0_c_1_v 0_c_2_n:0_c_2_v)"
      "PutEntityCF(3, bar, 3_c_1_n:3_c_1_v 3_c_2_n:3_c_2_v)",
      handler.seen);

  ASSERT_OK(batch.RollbackToSavePoint());

  handler.seen.clear();
  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(
      "PutEntity(foo, 0_c_1_n:0_c_1_v 0_c_2_n:0_c_2_v)"
      "PutEntityCF(2, foo, 2_c_1_n:2_c_1_v 2_c_2_n:2_c_2_v)",
      handler.seen);
}
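
// Operations issued through an explicit column family handle surface via
// the *CF callbacks with the handle's ID, while default-column-family
// operations use the unqualified forms.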
TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
  WriteBatch batch;
  ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
  ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
  ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2")));
  ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8")));
  ASSERT_OK(batch.Delete(&eight, Slice("eightfoo")));
  ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo")));
  ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo")));
  ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three")));
  ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
  ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom")));
  ASSERT_OK(batch.TimedPut(&zero, Slice("foo"), Slice("bar"),
                           /*write_unix_time*/ 0u));

  TestHandler handler;
  ASSERT_OK(batch.Iterate(&handler));
  ASSERT_EQ(
      "Put(foo, bar)"
      "PutCF(2, twofoo, bar2)"
      "PutCF(8, eightfoo, bar8)"
      "DeleteCF(8, eightfoo)"
      "SingleDeleteCF(2, twofoo)"
      "DeleteRangeCF(2, 3foo, 4foo)"
      "MergeCF(3, threethree, 3three)"
      "Put(foo, bar)"
      "Merge(omom, nom)"
      "TimedPut(foo, bar, 0)",
      handler.seen);
}
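
// WriteBatchWithIndex additionally maintains a searchable per-column-family
// index over the batch; note that TimedPut is not supported through the
// indexed batch, as the assertion below verifies.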
TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
  WriteBatchWithIndex batch;
  ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
  ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
  ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2")));
  ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8")));
  ASSERT_OK(batch.Delete(&eight, Slice("eightfoo")));
  ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo")));
  ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three")));
  ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
  ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom")));
  ASSERT_TRUE(
      batch.TimedPut(&zero, Slice("foo"), Slice("bar"), 0u).IsNotSupported());

  std::unique_ptr<WBWIIterator> iter;

  iter.reset(batch.NewIterator(&eight));
  iter->Seek("eightfoo");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
  ASSERT_EQ("bar8", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());

  iter.reset(batch.NewIterator(&two));
  iter->Seek("twofoo");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
  ASSERT_EQ("twofoo", iter->Entry().key.ToString());
  ASSERT_EQ("bar2", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type);
  ASSERT_EQ("twofoo", iter->Entry().key.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());

  iter.reset(batch.NewIterator());
  iter->Seek("gggg");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
  ASSERT_EQ("omom", iter->Entry().key.ToString());
  ASSERT_EQ("nom", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());

  iter.reset(batch.NewIterator(&zero));
  iter->Seek("foo");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
  ASSERT_EQ("foo", iter->Entry().key.ToString());
  ASSERT_EQ("bar", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
  ASSERT_EQ("foo", iter->Entry().key.ToString());
  ASSERT_EQ("bar", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
  ASSERT_EQ("omom", iter->Entry().key.ToString());
  ASSERT_EQ("nom", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());

  TestHandler handler;
  ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler));
  ASSERT_EQ(
      "Put(foo, bar)"
      "PutCF(2, twofoo, bar2)"
      "PutCF(8, eightfoo, bar8)"
      "DeleteCF(8, eightfoo)"
      "SingleDeleteCF(2, twofoo)"
      "MergeCF(3, threethree, 3three)"
      "Put(foo, bar)"
      "Merge(omom, nom)",
      handler.seen);
}
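
// Savepoints nest like a stack: each RollbackToSavePoint pops one entry and
// rewinds the batch to its size at that point, while PopSavePoint discards
// the savepoint without rewinding the contents.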
|
|
|
|
|
2015-07-11 03:15:45 +00:00
|
|
|
TEST_F(WriteBatchTest, SavePointTest) {
  Status s;
  WriteBatch batch;
  batch.SetSavePoint();

  ASSERT_OK(batch.Put("A", "a"));
  ASSERT_OK(batch.Put("B", "b"));
  batch.SetSavePoint();

  ASSERT_OK(batch.Put("C", "c"));
  ASSERT_OK(batch.Delete("A"));
  batch.SetSavePoint();
  batch.SetSavePoint();

  ASSERT_OK(batch.RollbackToSavePoint());
  ASSERT_EQ(
      "Delete(A)@3"
      "Put(A, a)@0"
      "Put(B, b)@1"
      "Put(C, c)@2",
      PrintContents(&batch));

  ASSERT_OK(batch.RollbackToSavePoint());
  ASSERT_OK(batch.RollbackToSavePoint());
  ASSERT_EQ(
      "Put(A, a)@0"
      "Put(B, b)@1",
      PrintContents(&batch));

  ASSERT_OK(batch.Delete("A"));
  ASSERT_OK(batch.Put("B", "bb"));

  ASSERT_OK(batch.RollbackToSavePoint());
  ASSERT_EQ("", PrintContents(&batch));

  s = batch.RollbackToSavePoint();
  ASSERT_TRUE(s.IsNotFound());
  ASSERT_EQ("", PrintContents(&batch));

  ASSERT_OK(batch.Put("D", "d"));
  ASSERT_OK(batch.Delete("A"));

  batch.SetSavePoint();

  ASSERT_OK(batch.Put("A", "aaa"));

  ASSERT_OK(batch.RollbackToSavePoint());
  ASSERT_EQ(
      "Delete(A)@1"
      "Put(D, d)@0",
      PrintContents(&batch));

  batch.SetSavePoint();

  ASSERT_OK(batch.Put("D", "d"));
  ASSERT_OK(batch.Delete("A"));

  ASSERT_OK(batch.RollbackToSavePoint());
  ASSERT_EQ(
      "Delete(A)@1"
      "Put(D, d)@0",
      PrintContents(&batch));

  s = batch.RollbackToSavePoint();
  ASSERT_TRUE(s.IsNotFound());
  ASSERT_EQ(
      "Delete(A)@1"
      "Put(D, d)@0",
      PrintContents(&batch));

  WriteBatch batch2;

  s = batch2.RollbackToSavePoint();
  ASSERT_TRUE(s.IsNotFound());
  ASSERT_EQ("", PrintContents(&batch2));

  ASSERT_OK(batch2.Delete("A"));
  batch2.SetSavePoint();

  s = batch2.RollbackToSavePoint();
  ASSERT_OK(s);
  ASSERT_EQ("Delete(A)@0", PrintContents(&batch2));

  batch2.Clear();
  ASSERT_EQ("", PrintContents(&batch2));

  batch2.SetSavePoint();

  ASSERT_OK(batch2.Delete("B"));
  ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));

  batch2.SetSavePoint();
  s = batch2.RollbackToSavePoint();
  ASSERT_OK(s);
  ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));

  s = batch2.RollbackToSavePoint();
  ASSERT_OK(s);
  ASSERT_EQ("", PrintContents(&batch2));

  s = batch2.RollbackToSavePoint();
  ASSERT_TRUE(s.IsNotFound());
  ASSERT_EQ("", PrintContents(&batch2));

  WriteBatch batch3;

  s = batch3.PopSavePoint();
  ASSERT_TRUE(s.IsNotFound());
  ASSERT_EQ("", PrintContents(&batch3));

  batch3.SetSavePoint();
  ASSERT_OK(batch3.Delete("A"));

  s = batch3.PopSavePoint();
  ASSERT_OK(s);
  ASSERT_EQ("Delete(A)@0", PrintContents(&batch3));
}
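
// Editor's sketch (hedged, not part of the original suite): the save-point API
// exercised above supports making a sub-group of writes revertible inside a
// larger batch. `TryOptionalWrite` is a hypothetical helper.
[[maybe_unused]] static Status TryOptionalWrite(WriteBatch* batch) {
  batch->SetSavePoint();
  Status s = batch->Put("optional_key", "optional_value");
  if (!s.ok()) {
    // Undo only the writes made since the save point; earlier writes survive.
    return batch->RollbackToSavePoint();
  }
  // Keep the writes but discard the now-unneeded save point marker.
  return batch->PopSavePoint();
}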
TEST_F(WriteBatchTest, MemoryLimitTest) {
  Status s;
  // The header size is 12 bytes. The two Puts take 8 bytes each, which gives a
  // total of 12 + 8 * 2 = 28 bytes.
  WriteBatch batch(0, 28);

  ASSERT_OK(batch.Put("a", "...."));
  ASSERT_OK(batch.Put("b", "...."));
  s = batch.Put("c", "....");
  ASSERT_TRUE(s.IsMemoryLimit());
}
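
// Editor's note on the arithmetic above: each Put("x", "....") record encodes
// as a 1-byte type tag, a 1-byte varint key length, the 1-byte key, a 1-byte
// varint value length, and the 4-byte value, i.e. 1 + 1 + 1 + 1 + 4 = 8 bytes.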
namespace {
class TimestampChecker : public WriteBatch::Handler {
 public:
  explicit TimestampChecker(
      std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps, Slice ts)
      : cf_to_ucmps_(std::move(cf_to_ucmps)), timestamp_(std::move(ts)) {}
  Status PutCF(uint32_t cf, const Slice& key,
               const Slice& /*value*/) override {
    auto cf_iter = cf_to_ucmps_.find(cf);
    if (cf_iter == cf_to_ucmps_.end()) {
      return Status::Corruption();
    }
    const Comparator* const ucmp = cf_iter->second;
    assert(ucmp);
    size_t ts_sz = ucmp->timestamp_size();
    if (ts_sz == 0) {
      return Status::OK();
    }
    if (key.size() < ts_sz) {
      return Status::Corruption();
    }
    Slice ts = ExtractTimestampFromUserKey(key, ts_sz);
    if (ts.compare(timestamp_) != 0) {
      return Status::Corruption();
    }
    return Status::OK();
  }

 private:
  std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps_;
  Slice timestamp_;
};

Status CheckTimestampsInWriteBatch(
    WriteBatch& wb, Slice timestamp,
    std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps) {
  TimestampChecker ts_checker(cf_to_ucmps, timestamp);
  return wb.Iterate(&ts_checker);
}
} // anonymous namespace
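
// Editor's note: ExtractTimestampFromUserKey() above relies on the
// user-defined timestamp layout, where the timestamp occupies the trailing
// ts_sz bytes of the user key (key = original user key || timestamp).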
TEST_F(WriteBatchTest, SanityChecks) {
  ColumnFamilyHandleImplDummy cf0(0,
                                  test::BytewiseComparatorWithU64TsWrapper());
  ColumnFamilyHandleImplDummy cf4(4);

  WriteBatch wb(0, 0, 0, /*default_cf_ts_sz=*/sizeof(uint64_t));

  // Sanity checks for the new WriteBatch APIs with extra 'ts' arg.
  ASSERT_TRUE(wb.Put(nullptr, "key", "ts", "value").IsInvalidArgument());
  ASSERT_TRUE(wb.Delete(nullptr, "key", "ts").IsInvalidArgument());
  ASSERT_TRUE(wb.SingleDelete(nullptr, "key", "ts").IsInvalidArgument());
  ASSERT_TRUE(wb.Merge(nullptr, "key", "ts", "value").IsInvalidArgument());
  ASSERT_TRUE(wb.DeleteRange(nullptr, "begin_key", "end_key", "ts")
                  .IsInvalidArgument());

  ASSERT_TRUE(wb.Put(&cf4, "key", "ts", "value").IsInvalidArgument());
  ASSERT_TRUE(wb.Delete(&cf4, "key", "ts").IsInvalidArgument());
  ASSERT_TRUE(wb.SingleDelete(&cf4, "key", "ts").IsInvalidArgument());
  ASSERT_TRUE(wb.Merge(&cf4, "key", "ts", "value").IsInvalidArgument());
  ASSERT_TRUE(
      wb.DeleteRange(&cf4, "begin_key", "end_key", "ts").IsInvalidArgument());

  constexpr size_t wrong_ts_sz = 1 + sizeof(uint64_t);
  std::string ts(wrong_ts_sz, '\0');

  ASSERT_TRUE(wb.Put(&cf0, "key", ts, "value").IsInvalidArgument());
  ASSERT_TRUE(wb.Delete(&cf0, "key", ts).IsInvalidArgument());
  ASSERT_TRUE(wb.SingleDelete(&cf0, "key", ts).IsInvalidArgument());
  ASSERT_TRUE(wb.Merge(&cf0, "key", ts, "value").IsInvalidArgument());
  ASSERT_TRUE(
      wb.DeleteRange(&cf0, "begin_key", "end_key", ts).IsInvalidArgument());

  // Sanity checks for the new WriteBatch APIs without extra 'ts' arg.
  WriteBatch wb1(0, 0, 0, wrong_ts_sz);
  ASSERT_TRUE(wb1.Put(&cf0, "key", "value").IsInvalidArgument());
  ASSERT_TRUE(wb1.Delete(&cf0, "key").IsInvalidArgument());
  ASSERT_TRUE(wb1.SingleDelete(&cf0, "key").IsInvalidArgument());
  ASSERT_TRUE(wb1.Merge(&cf0, "key", "value").IsInvalidArgument());
  ASSERT_TRUE(
      wb1.DeleteRange(&cf0, "begin_key", "end_key").IsInvalidArgument());
}
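
// Editor's sketch (hedged, not part of the original suite): the counterpart to
// the failures above. With a timestamp whose size matches the column family's
// comparator, the same kind of operation should pass the size check.
// `TimestampOfMatchingSizeSketch` is a hypothetical illustration.
[[maybe_unused]] static void TimestampOfMatchingSizeSketch() {
  ColumnFamilyHandleImplDummy cf(0,
                                 test::BytewiseComparatorWithU64TsWrapper());
  WriteBatch wb(0, 0, 0, /*default_cf_ts_sz=*/sizeof(uint64_t));
  std::string good_ts(sizeof(uint64_t), '\0');  // exactly eight bytes
  Status s = wb.Put(&cf, "key", good_ts, "value");
  assert(s.ok());
  (void)s;  // silence unused-variable warnings in release builds
}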
TEST_F(WriteBatchTest, UpdateTimestamps) {
  // We assume the last eight bytes of each key are reserved for timestamps.
  // Therefore, we must make sure each key is longer than eight bytes.
  constexpr size_t key_size = 16;
  constexpr size_t num_of_keys = 10;
  std::vector<std::string> key_strs(num_of_keys, std::string(key_size, '\0'));

  ColumnFamilyHandleImplDummy cf0(0);
  ColumnFamilyHandleImplDummy cf4(4,
                                  test::BytewiseComparatorWithU64TsWrapper());
  ColumnFamilyHandleImplDummy cf5(5,
                                  test::BytewiseComparatorWithU64TsWrapper());

  const std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps = {
      {0, cf0.GetComparator()},
      {4, cf4.GetComparator()},
      {5, cf5.GetComparator()}};

  static constexpr size_t timestamp_size = sizeof(uint64_t);

  {
    WriteBatch wb1, wb2, wb3, wb4, wb5, wb6, wb7;
    ASSERT_OK(wb1.Put(&cf0, "key", "value"));
    ASSERT_FALSE(WriteBatchInternal::HasKeyWithTimestamp(wb1));
    ASSERT_OK(wb2.Put(&cf4, "key", "value"));
    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb2));
    ASSERT_OK(wb3.Put(&cf4, "key", /*ts=*/std::string(timestamp_size, '\xfe'),
                      "value"));
    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb3));
    ASSERT_OK(wb4.Delete(&cf4, "key",
                         /*ts=*/std::string(timestamp_size, '\xfe')));
    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb4));
    ASSERT_OK(wb5.Delete(&cf4, "key"));
    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb5));
    ASSERT_OK(wb6.SingleDelete(&cf4, "key"));
    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb6));
    ASSERT_OK(wb7.SingleDelete(&cf4, "key",
                               /*ts=*/std::string(timestamp_size, '\xfe')));
    ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb7));
  }

  WriteBatch batch;
  // Write to the batch. We will assign timestamps later.
  for (const auto& key_str : key_strs) {
    ASSERT_OK(batch.Put(&cf0, key_str, "value"));
    ASSERT_OK(batch.Put(&cf4, key_str, "value"));
    ASSERT_OK(batch.Put(&cf5, key_str, "value"));
  }

  const auto checker1 = [](uint32_t cf) {
    if (cf == 4 || cf == 5) {
      return timestamp_size;
    } else if (cf == 0) {
      return static_cast<size_t>(0);
    } else {
      return std::numeric_limits<size_t>::max();
    }
  };
  ASSERT_OK(
      batch.UpdateTimestamps(std::string(timestamp_size, '\xfe'), checker1));
  ASSERT_OK(CheckTimestampsInWriteBatch(
      batch, std::string(timestamp_size, '\xfe'), cf_to_ucmps));

  // We use indexed_cf_to_ucmps, non_indexed_cfs_with_ts and timestamp_size to
  // simulate the case in which a transaction enables indexing for some writes
  // while disabling indexing for other writes. A transaction uses a
  // WriteBatchWithIndex object to buffer writes (we consider write-committed
  // policy only). If indexing is enabled, then writes go through the
  // WriteBatchWithIndex API, populating a WBWI internal data structure, i.e. a
  // mapping from cf to user comparators. If indexing is disabled, a transaction
  // writes directly to the underlying raw WriteBatch. We will need to track the
  // comparator information for the column families to which un-indexed writes
  // are performed. When calling the UpdateTimestamps API of WriteBatch, we
  // need indexed_cf_to_ucmps, non_indexed_cfs_with_ts, and timestamp_size to
  // perform the checking.
  std::unordered_map<uint32_t, const Comparator*> indexed_cf_to_ucmps = {
      {0, cf0.GetComparator()}, {4, cf4.GetComparator()}};
  std::unordered_set<uint32_t> non_indexed_cfs_with_ts = {cf5.GetID()};
  const auto checker2 = [&indexed_cf_to_ucmps,
                         &non_indexed_cfs_with_ts](uint32_t cf) {
    if (non_indexed_cfs_with_ts.count(cf) > 0) {
      return timestamp_size;
    }
    auto cf_iter = indexed_cf_to_ucmps.find(cf);
    if (cf_iter == indexed_cf_to_ucmps.end()) {
      assert(false);
      return std::numeric_limits<size_t>::max();
    }
    const Comparator* const ucmp = cf_iter->second;
    assert(ucmp);
    return ucmp->timestamp_size();
  };
  ASSERT_OK(
      batch.UpdateTimestamps(std::string(timestamp_size, '\xef'), checker2));
  ASSERT_OK(CheckTimestampsInWriteBatch(
      batch, std::string(timestamp_size, '\xef'), cf_to_ucmps));
}
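
// Editor's sketch (hedged, not part of the original suite): UpdateTimestamps()
// enables the two-phase pattern exercised above, where keys are buffered with
// placeholder timestamp slots and the real commit timestamp is stamped in
// later. `StampCommitTimestamp` is a hypothetical helper; it assumes every
// column family written by the batch uses an 8-byte timestamp.
[[maybe_unused]] static Status StampCommitTimestamp(WriteBatch* batch,
                                                    uint64_t commit_ts) {
  std::string ts;
  PutFixed64(&ts, commit_ts);  // fixed-width 64-bit encoding of the timestamp
  return batch->UpdateTimestamps(
      ts, [](uint32_t /*cf*/) { return sizeof(uint64_t); });
}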
TEST_F(WriteBatchTest, CommitWithTimestamp) {
  WriteBatch wb;
  const std::string txn_name = "xid1";
  std::string ts;
  constexpr uint64_t commit_ts = 23;
  PutFixed64(&ts, commit_ts);
  ASSERT_OK(WriteBatchInternal::MarkCommitWithTimestamp(&wb, txn_name, ts));
  TestHandler handler;
  ASSERT_OK(wb.Iterate(&handler));
  ASSERT_EQ("MarkCommitWithTimestamp(" + txn_name + ", " +
                Slice(ts).ToString(true) + ")",
            handler.seen);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}