2020-03-13 18:33:04 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2022-05-20 01:39:41 +00:00
|
|
|
#include "db/db_with_timestamp_test_util.h"
|
2020-03-13 18:33:04 +00:00
|
|
|
#include "port/stack_trace.h"
|
|
|
|
#include "rocksdb/perf_context.h"
|
|
|
|
#include "rocksdb/utilities/debug.h"
|
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/block_based/block_builder.h"
|
|
|
|
#include "test_util/sync_point.h"
|
2021-09-08 14:45:59 +00:00
|
|
|
#include "test_util/testutil.h"
|
2020-07-09 21:33:42 +00:00
|
|
|
#include "utilities/fault_injection_env.h"
|
2022-11-01 05:28:58 +00:00
|
|
|
#include "utilities/merge_operators/string_append/stringappend2.h"
|
2020-03-13 18:33:04 +00:00
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
|
|
|
|
public:
|
|
|
|
DBBasicTestWithTimestamp()
|
|
|
|
: DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {}
|
|
|
|
};
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, SanityChecks) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.avoid_flush_during_shutdown = true;
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Options options1 = CurrentOptions();
|
|
|
|
options1.env = env_;
|
2022-02-08 20:14:25 +00:00
|
|
|
options1.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
options1.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
|
|
|
|
assert(options1.comparator &&
|
|
|
|
options1.comparator->timestamp_size() == sizeof(uint64_t));
|
|
|
|
ColumnFamilyHandle* handle = nullptr;
|
|
|
|
Status s = db_->CreateColumnFamily(options1, "data", &handle);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
std::string dummy_ts(sizeof(uint64_t), '\0');
|
|
|
|
// Perform timestamp operations on default cf.
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->Put(WriteOptions(), "key", dummy_ts, "value").IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), "key",
|
|
|
|
dummy_ts, "value")
|
2022-11-01 05:28:58 +00:00
|
|
|
.IsInvalidArgument());
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_TRUE(db_->Delete(WriteOptions(), "key", dummy_ts).IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->SingleDelete(WriteOptions(), "key", dummy_ts).IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
|
|
|
|
"begin_key", "end_key", dummy_ts)
|
2022-09-30 23:13:03 +00:00
|
|
|
.IsInvalidArgument());
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
|
|
|
|
// Perform non-timestamp operations on "data" cf.
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->Put(WriteOptions(), handle, "key", "value").IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(db_->Delete(WriteOptions(), handle, "key").IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->SingleDelete(WriteOptions(), handle, "key").IsInvalidArgument());
|
|
|
|
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->Merge(WriteOptions(), handle, "key", "value").IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key")
|
|
|
|
.IsInvalidArgument());
|
|
|
|
|
|
|
|
{
|
|
|
|
WriteBatch wb;
|
|
|
|
ASSERT_OK(wb.Put(handle, "key", "value"));
|
|
|
|
ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
|
|
|
|
}
|
|
|
|
{
|
|
|
|
WriteBatch wb;
|
|
|
|
ASSERT_OK(wb.Delete(handle, "key"));
|
|
|
|
ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
|
|
|
|
}
|
|
|
|
{
|
|
|
|
WriteBatch wb;
|
|
|
|
ASSERT_OK(wb.SingleDelete(handle, "key"));
|
|
|
|
ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
|
|
|
|
}
|
2022-09-30 23:13:03 +00:00
|
|
|
{
|
|
|
|
WriteBatch wb;
|
|
|
|
ASSERT_OK(wb.DeleteRange(handle, "begin_key", "end_key"));
|
|
|
|
ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
|
|
|
|
// Perform timestamp operations with timestamps of incorrect size.
|
|
|
|
const std::string wrong_ts(sizeof(uint32_t), '\0');
|
|
|
|
ASSERT_TRUE(db_->Put(WriteOptions(), handle, "key", wrong_ts, "value")
|
|
|
|
.IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(db_->Merge(WriteOptions(), handle, "key", wrong_ts, "value")
|
2022-11-01 05:28:58 +00:00
|
|
|
.IsInvalidArgument());
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->Delete(WriteOptions(), handle, "key", wrong_ts).IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(db_->SingleDelete(WriteOptions(), handle, "key", wrong_ts)
|
|
|
|
.IsInvalidArgument());
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key", wrong_ts)
|
2022-09-30 23:13:03 +00:00
|
|
|
.IsInvalidArgument());
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
|
|
|
|
delete handle;
|
|
|
|
}
|
|
|
|
|
2021-09-12 22:33:15 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.avoid_flush_during_shutdown = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Options options1 = CurrentOptions();
|
|
|
|
options1.env = env_;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options1.comparator = &test_cmp;
|
|
|
|
ColumnFamilyHandle* handle = nullptr;
|
|
|
|
Status s = db_->CreateColumnFamily(options1, "data", &handle);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
WriteBatch wb;
|
|
|
|
ASSERT_OK(wb.Put("a", "value"));
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(wb.Put(handle, "a", "value"));
|
2021-09-12 22:33:15 +00:00
|
|
|
{
|
|
|
|
std::string ts = Timestamp(1, 0);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
const auto ts_sz_func = [kTimestampSize, handle](uint32_t cf_id) {
|
|
|
|
assert(handle);
|
|
|
|
if (cf_id == 0) {
|
|
|
|
return static_cast<size_t>(0);
|
|
|
|
} else if (cf_id == handle->GetID()) {
|
|
|
|
return kTimestampSize;
|
|
|
|
} else {
|
|
|
|
assert(false);
|
|
|
|
return std::numeric_limits<size_t>::max();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func));
|
2021-09-12 22:33:15 +00:00
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &wb));
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
const auto verify_db = [this](ColumnFamilyHandle* h, const std::string& key,
|
|
|
|
const std::string& ts,
|
|
|
|
const std::string& expected_value) {
|
|
|
|
ASSERT_EQ(expected_value, Get(key));
|
2021-09-12 22:33:15 +00:00
|
|
|
Slice read_ts_slice(ts);
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
std::string value;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Get(read_opts, h, key, &value));
|
|
|
|
ASSERT_EQ(expected_value, value);
|
2021-09-12 22:33:15 +00:00
|
|
|
};
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
verify_db(handle, "a", Timestamp(1, 0), "value");
|
2021-09-12 22:33:15 +00:00
|
|
|
|
|
|
|
delete handle;
|
|
|
|
Close();
|
|
|
|
|
|
|
|
std::vector<ColumnFamilyDescriptor> cf_descs;
|
|
|
|
cf_descs.emplace_back(kDefaultColumnFamilyName, options);
|
|
|
|
cf_descs.emplace_back("data", options1);
|
|
|
|
options.create_if_missing = false;
|
|
|
|
s = DB::Open(options, dbname_, cf_descs, &handles_, &db_);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
verify_db(handles_[1], "a", Timestamp(1, 0), "value");
|
2021-09-12 22:33:15 +00:00
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2024-08-23 20:58:04 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, MultiGetMultipleCfs) {
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.avoid_flush_during_shutdown = true;
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Options options1 = CurrentOptions();
|
|
|
|
options1.env = env_;
|
|
|
|
options1.comparator = &test_cmp;
|
|
|
|
ColumnFamilyHandle* handle = nullptr;
|
|
|
|
Status s = db_->CreateColumnFamily(options1, "data", &handle);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
std::string ts = Timestamp(1, 0);
|
|
|
|
WriteBatch wb(0, 0, 0, kTimestampSize);
|
|
|
|
ASSERT_OK(wb.Put("a", "value"));
|
|
|
|
ASSERT_OK(wb.Put(handle, "a", "value"));
|
|
|
|
const auto ts_sz_func = [kTimestampSize](uint32_t /*cf_id*/) {
|
|
|
|
return kTimestampSize;
|
|
|
|
};
|
|
|
|
ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func));
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &wb));
|
|
|
|
|
|
|
|
int num_keys = 2;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<std::string> expected_values;
|
|
|
|
for (int i = 0; i < num_keys; i++) {
|
|
|
|
keys.push_back("a");
|
|
|
|
expected_values.push_back("value");
|
|
|
|
}
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
handles.push_back(db_->DefaultColumnFamily());
|
|
|
|
handles.push_back(handle);
|
|
|
|
|
|
|
|
{
|
|
|
|
Slice read_ts_slice(ts);
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
values.resize(num_keys);
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
statuses.resize(num_keys);
|
|
|
|
std::vector<std::string> timestamps;
|
|
|
|
timestamps.resize(num_keys);
|
|
|
|
|
|
|
|
db_->MultiGet(read_opts, num_keys, handles.data(), keys.data(),
|
|
|
|
values.data(), timestamps.data(), statuses.data());
|
|
|
|
|
|
|
|
for (int i = 0; i < num_keys; i++) {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
ASSERT_EQ(expected_values[i], values[i].ToString());
|
|
|
|
ASSERT_EQ(ts, timestamps[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
delete handle;
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-12-02 20:59:23 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-12-02 20:59:23 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
std::string start_str = "foo";
|
|
|
|
std::string end_str = "foo2";
|
|
|
|
Slice start(start_str), end(end_str);
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-11-09 21:07:33 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, GcPreserveLatestVersionBelowFullHistoryLow) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::string ts_str = Timestamp(1, 0);
|
|
|
|
WriteOptions wopts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
|
|
|
|
ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
|
|
|
|
ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
|
2021-11-09 21:07:33 +00:00
|
|
|
|
|
|
|
ts_str = Timestamp(2, 0);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Delete(wopts, "k3", ts_str));
|
2021-11-09 21:07:33 +00:00
|
|
|
|
|
|
|
ts_str = Timestamp(4, 0);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v5"));
|
2021-11-09 21:07:33 +00:00
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
ts_str = Timestamp(5, 0);
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k0", "k9", ts_str));
|
|
|
|
|
2021-11-09 21:07:33 +00:00
|
|
|
ts_str = Timestamp(3, 0);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Slice ts = ts_str;
|
2021-11-09 21:07:33 +00:00
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.full_history_ts_low = &ts;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ReadOptions ropts;
|
|
|
|
ropts.timestamp = &ts;
|
|
|
|
std::string value;
|
|
|
|
Status s = db_->Get(ropts, "k1", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("v1", value);
|
|
|
|
|
2022-06-04 03:00:42 +00:00
|
|
|
std::string key_ts;
|
|
|
|
ASSERT_TRUE(db_->Get(ropts, "k3", &value, &key_ts).IsNotFound());
|
|
|
|
ASSERT_EQ(Timestamp(2, 0), key_ts);
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
ts_str = Timestamp(5, 0);
|
|
|
|
ts = ts_str;
|
|
|
|
ropts.timestamp = &ts;
|
|
|
|
ASSERT_TRUE(db_->Get(ropts, "k2", &value, &key_ts).IsNotFound());
|
|
|
|
ASSERT_EQ(Timestamp(5, 0), key_ts);
|
|
|
|
ASSERT_TRUE(db_->Get(ropts, "k2", &value).IsNotFound());
|
|
|
|
|
2021-11-09 21:07:33 +00:00
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-02-08 21:43:23 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const std::string kKey = "test kKey";
|
|
|
|
|
|
|
|
// Test set ts_low first and flush()
|
|
|
|
int current_ts_low = 5;
|
|
|
|
std::string ts_low_str = Timestamp(current_ts_low, 0);
|
|
|
|
Slice ts_low = ts_low_str;
|
|
|
|
CompactRangeOptions comp_opts;
|
|
|
|
comp_opts.full_history_ts_low = &ts_low;
|
|
|
|
comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
|
|
|
|
|
|
|
ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
|
|
|
|
|
|
|
|
auto* cfd =
|
|
|
|
static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
|
|
|
|
->cfd();
|
|
|
|
auto result_ts_low = cfd->GetFullHistoryTsLow();
|
|
|
|
|
|
|
|
ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
|
|
|
|
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(i, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
|
2021-02-08 21:43:23 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string ts_str = Timestamp(i, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::string value;
|
|
|
|
Status status = db_->Get(read_opts, kKey, &value);
|
2022-06-06 21:36:22 +00:00
|
|
|
if (i < current_ts_low) {
|
|
|
|
ASSERT_TRUE(status.IsInvalidArgument());
|
2021-02-08 21:43:23 +00:00
|
|
|
} else {
|
|
|
|
ASSERT_OK(status);
|
|
|
|
ASSERT_TRUE(value.compare(Key(i)) == 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test set ts_low and then trigger compaction
|
|
|
|
for (int i = 10; i < 20; i++) {
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(i, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
|
2021-02-08 21:43:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
current_ts_low = 15;
|
|
|
|
ts_low_str = Timestamp(current_ts_low, 0);
|
|
|
|
ts_low = ts_low_str;
|
|
|
|
comp_opts.full_history_ts_low = &ts_low;
|
|
|
|
ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
|
|
|
|
result_ts_low = cfd->GetFullHistoryTsLow();
|
|
|
|
ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
|
|
|
|
|
2021-11-09 21:07:33 +00:00
|
|
|
for (int i = current_ts_low; i < 20; i++) {
|
2021-02-08 21:43:23 +00:00
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string ts_str = Timestamp(i, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::string value;
|
|
|
|
Status status = db_->Get(read_opts, kKey, &value);
|
2021-11-09 21:07:33 +00:00
|
|
|
ASSERT_OK(status);
|
|
|
|
ASSERT_TRUE(value.compare(Key(i)) == 0);
|
2021-02-08 21:43:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Test invalid compaction with range
|
|
|
|
Slice start(kKey), end(kKey);
|
|
|
|
Status s = db_->CompactRange(comp_opts, &start, &end);
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
s = db_->CompactRange(comp_opts, &start, nullptr);
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
s = db_->CompactRange(comp_opts, nullptr, &end);
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
|
|
|
|
// Test invalid compaction with the decreasing ts_low
|
|
|
|
ts_low_str = Timestamp(current_ts_low - 1, 0);
|
|
|
|
ts_low = ts_low_str;
|
|
|
|
comp_opts.full_history_ts_low = &ts_low;
|
|
|
|
s = db_->CompactRange(comp_opts, nullptr, nullptr);
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-12-23 19:02:43 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
std::string ts_low_str = Timestamp(9, 0);
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str));
|
|
|
|
std::string result_ts_low;
|
|
|
|
ASSERT_OK(db_->GetFullHistoryTsLow(nullptr, &result_ts_low));
|
|
|
|
ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low_str, result_ts_low) == 0);
|
|
|
|
// test increase full_history_low backward
|
|
|
|
std::string ts_low_str_back = Timestamp(8, 0);
|
|
|
|
auto s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
ts_low_str_back);
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
// test IncreaseFullHistoryTsLow with a timestamp whose length is longger
|
|
|
|
// than the cf's timestamp size
|
|
|
|
std::string ts_low_str_long(Timestamp(0, 0).size() + 1, 'a');
|
|
|
|
s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
ts_low_str_long);
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
// test IncreaseFullHistoryTsLow with a timestamp which is null
|
2024-01-05 19:53:57 +00:00
|
|
|
std::string ts_low_str_null;
|
2021-12-23 19:02:43 +00:00
|
|
|
s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
ts_low_str_null);
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
// test IncreaseFullHistoryTsLow for a column family that does not enable
|
|
|
|
// timestamp
|
|
|
|
options.comparator = BytewiseComparator();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ts_low_str = Timestamp(10, 0);
|
|
|
|
s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str);
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
// test GetFullHistoryTsLow for a column family that does not enable
|
|
|
|
// timestamp
|
|
|
|
std::string current_ts_low;
|
|
|
|
s = db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), ¤t_ts_low);
|
|
|
|
ASSERT_EQ(s, Status::InvalidArgument());
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-12-02 20:59:23 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 100000000; // Large write buffer
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
auto default_cf = db_->DefaultColumnFamily();
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-12-02 20:59:23 +00:00
|
|
|
|
|
|
|
const int N = 128;
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < N; i++) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, Key(i), ts, rnd.RandomString(1024)));
|
2020-12-02 20:59:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t size;
|
|
|
|
std::string start = Key(50);
|
|
|
|
std::string end = Key(60);
|
|
|
|
Range r(start, end);
|
|
|
|
SizeApproximationOptions size_approx_options;
|
2022-02-03 23:14:17 +00:00
|
|
|
size_approx_options.include_memtables = true;
|
2020-12-02 20:59:23 +00:00
|
|
|
size_approx_options.include_files = true;
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
|
|
|
|
ASSERT_GT(size, 6000);
|
|
|
|
ASSERT_LT(size, 204800);
|
|
|
|
|
|
|
|
// test multiple ranges
|
|
|
|
std::vector<Range> ranges;
|
|
|
|
std::string start_tmp = Key(10);
|
|
|
|
std::string end_tmp = Key(20);
|
2024-01-05 19:53:57 +00:00
|
|
|
ranges.emplace_back(start_tmp, end_tmp);
|
|
|
|
ranges.emplace_back(start, end);
|
2020-12-02 20:59:23 +00:00
|
|
|
uint64_t range_sizes[2];
|
|
|
|
ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf,
|
|
|
|
ranges.data(), 2, range_sizes));
|
|
|
|
|
|
|
|
ASSERT_EQ(range_sizes[1], size);
|
|
|
|
|
|
|
|
// Zero if not including mem table
|
2021-01-06 22:14:01 +00:00
|
|
|
ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_EQ(size, 0);
|
|
|
|
|
|
|
|
start = Key(500);
|
|
|
|
end = Key(600);
|
|
|
|
r = Range(start, end);
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
|
|
|
|
ASSERT_EQ(size, 0);
|
|
|
|
|
|
|
|
// Test range boundaries
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, Key(1000), ts, rnd.RandomString(1024)));
|
2020-12-02 20:59:23 +00:00
|
|
|
// Should include start key
|
|
|
|
start = Key(1000);
|
|
|
|
end = Key(1100);
|
|
|
|
r = Range(start, end);
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
|
|
|
|
ASSERT_GT(size, 0);
|
|
|
|
|
2023-08-11 18:26:38 +00:00
|
|
|
uint64_t total_mem_count;
|
|
|
|
uint64_t total_mem_size;
|
|
|
|
db_->GetApproximateMemTableStats(default_cf, r, &total_mem_count,
|
|
|
|
&total_mem_size);
|
|
|
|
ASSERT_GT(total_mem_count, 0);
|
|
|
|
ASSERT_GT(total_mem_size, 0);
|
|
|
|
|
2020-12-02 20:59:23 +00:00
|
|
|
// Should exclude end key
|
|
|
|
start = Key(900);
|
|
|
|
end = Key(1000);
|
|
|
|
r = Range(start, end);
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
|
|
|
|
ASSERT_EQ(size, 0);
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, SimpleIterate) {
|
2020-03-17 17:47:36 +00:00
|
|
|
const int kNumKeysPerFile = 128;
|
|
|
|
const uint64_t kMaxKey = 1024;
|
2020-03-13 18:33:04 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
2021-09-08 14:45:59 +00:00
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
2020-03-13 18:33:04 +00:00
|
|
|
DestroyAndReopen(options);
|
|
|
|
const std::vector<uint64_t> start_keys = {1, 0};
|
|
|
|
const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
|
|
|
|
Timestamp(3, 0)};
|
|
|
|
const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
|
|
|
|
Timestamp(4, 0)};
|
|
|
|
for (size_t i = 0; i < write_timestamps.size(); ++i) {
|
|
|
|
WriteOptions write_opts;
|
|
|
|
for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
|
|
|
|
"value" + std::to_string(i));
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < read_timestamps.size(); ++i) {
|
|
|
|
ReadOptions read_opts;
|
|
|
|
Slice read_ts = read_timestamps[i];
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
int count = 0;
|
|
|
|
uint64_t key = 0;
|
2021-03-10 19:13:55 +00:00
|
|
|
// Forward iterate.
|
2020-03-13 18:33:04 +00:00
|
|
|
for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
|
|
|
|
it->Next(), ++count, ++key) {
|
2020-08-05 01:50:05 +00:00
|
|
|
CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
|
|
|
|
"value" + std::to_string(i), write_timestamps[i]);
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
|
|
|
size_t expected_count = kMaxKey - start_keys[i] + 1;
|
|
|
|
ASSERT_EQ(expected_count, count);
|
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
// Backward iterate.
|
|
|
|
count = 0;
|
|
|
|
for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
|
|
|
|
it->Prev(), ++count, --key) {
|
|
|
|
CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
|
|
|
|
"value" + std::to_string(i), write_timestamps[i]);
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2021-03-10 19:13:55 +00:00
|
|
|
ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
|
|
|
|
|
|
|
|
// SeekToFirst()/SeekToLast() with lower/upper bounds.
|
2020-03-13 18:33:04 +00:00
|
|
|
// Then iter with lower and upper bounds.
|
|
|
|
uint64_t l = 0;
|
|
|
|
uint64_t r = kMaxKey + 1;
|
|
|
|
while (l < r) {
|
|
|
|
std::string lb_str = Key1(l);
|
|
|
|
Slice lb = lb_str;
|
|
|
|
std::string ub_str = Key1(r);
|
|
|
|
Slice ub = ub_str;
|
|
|
|
read_opts.iterate_lower_bound = &lb;
|
|
|
|
read_opts.iterate_upper_bound = &ub;
|
|
|
|
it.reset(db_->NewIterator(read_opts));
|
|
|
|
for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
|
|
|
|
it->Valid(); it->Next(), ++key, ++count) {
|
2020-08-05 01:50:05 +00:00
|
|
|
CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
|
|
|
|
"value" + std::to_string(i), write_timestamps[i]);
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_EQ(r - std::max(l, start_keys[i]), count);
|
2021-03-10 19:13:55 +00:00
|
|
|
|
|
|
|
for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
|
|
|
|
it->Valid(); it->Prev(), --key, ++count) {
|
|
|
|
CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
|
|
|
|
"value" + std::to_string(i), write_timestamps[i]);
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2020-03-13 18:33:04 +00:00
|
|
|
l += (kMaxKey / 100);
|
|
|
|
r -= (kMaxKey / 100);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2022-03-12 00:13:23 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
auto check_value_by_ts = [](DB* db, Slice key, std::string readTs,
|
2022-06-04 03:00:42 +00:00
|
|
|
Status status, std::string checkValue,
|
|
|
|
std::string expected_ts) {
|
2022-03-12 00:13:23 +00:00
|
|
|
ReadOptions ropts;
|
|
|
|
Slice ts = readTs;
|
|
|
|
ropts.timestamp = &ts;
|
|
|
|
std::string value;
|
2022-06-04 03:00:42 +00:00
|
|
|
std::string key_ts;
|
|
|
|
Status s = db->Get(ropts, key, &value, &key_ts);
|
2022-03-12 00:13:23 +00:00
|
|
|
ASSERT_TRUE(s == status);
|
|
|
|
if (s.ok()) {
|
|
|
|
ASSERT_EQ(checkValue, value);
|
|
|
|
}
|
2022-06-04 03:00:42 +00:00
|
|
|
if (s.ok() || s.IsNotFound()) {
|
|
|
|
ASSERT_EQ(expected_ts, key_ts);
|
|
|
|
}
|
2022-03-12 00:13:23 +00:00
|
|
|
};
|
|
|
|
// Construct data of different versions with different ts
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(2, 0), "v1"));
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(4, 0), "v2"));
|
|
|
|
ASSERT_OK(db_->Delete(WriteOptions(), "k1", Timestamp(5, 0)));
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(6, 0), "v3"));
|
2022-06-04 03:00:42 +00:00
|
|
|
check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v3",
|
|
|
|
Timestamp(6, 0));
|
2022-03-12 00:13:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
Close();
|
|
|
|
|
|
|
|
ColumnFamilyOptions cf_options(options);
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
2024-01-05 19:53:57 +00:00
|
|
|
column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
|
2022-03-12 00:13:23 +00:00
|
|
|
DBOptions db_options(options);
|
|
|
|
|
|
|
|
// Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND.
|
|
|
|
ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
|
|
|
|
&handles_, &db_, Timestamp(5, 0)));
|
2022-06-04 03:00:42 +00:00
|
|
|
check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::NotFound(), "",
|
|
|
|
Timestamp(5, 0));
|
2022-03-12 00:13:23 +00:00
|
|
|
Close();
|
|
|
|
|
|
|
|
// Trim data whose timestamp > Timestamp(4, 0), read(k1, ts(7)) <- v2
|
|
|
|
ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
|
|
|
|
&handles_, &db_, Timestamp(4, 0)));
|
2022-06-04 03:00:42 +00:00
|
|
|
check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v2",
|
|
|
|
Timestamp(4, 0));
|
2022-03-12 00:13:23 +00:00
|
|
|
Close();
|
2022-09-30 23:13:03 +00:00
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "k1",
|
|
|
|
"k3", Timestamp(7, 0)));
|
|
|
|
check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::NotFound(), "",
|
|
|
|
Timestamp(7, 0));
|
|
|
|
Close();
|
|
|
|
// Trim data whose timestamp > Timestamp(6, 0), read(k1, ts(8)) <- v2
|
|
|
|
ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
|
|
|
|
&handles_, &db_, Timestamp(6, 0)));
|
|
|
|
check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::OK(), "v2",
|
|
|
|
Timestamp(4, 0));
|
|
|
|
Close();
|
2022-03-12 00:13:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, OpenAndTrimHistoryInvalidOptionTest) {
|
|
|
|
Destroy(last_options_);
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
|
|
|
|
ColumnFamilyOptions cf_options(options);
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
2024-01-05 19:53:57 +00:00
|
|
|
column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
|
2022-03-12 00:13:23 +00:00
|
|
|
DBOptions db_options(options);
|
|
|
|
|
|
|
|
// OpenAndTrimHistory should not work with avoid_flush_during_recovery
|
|
|
|
db_options.avoid_flush_during_recovery = true;
|
|
|
|
ASSERT_TRUE(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
|
|
|
|
&handles_, &db_, Timestamp(0, 0))
|
|
|
|
.IsInvalidArgument());
|
|
|
|
}
|
|
|
|
|
2021-11-19 19:36:03 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
// Create 2 tables
|
|
|
|
for (int table = 0; table < 2; ++table) {
|
|
|
|
for (int i = 0; i < 10; i++) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(i, 0);
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "key", ts, Key(i)));
|
2021-11-19 19:36:03 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
TablePropertiesCollection props;
|
|
|
|
ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
|
|
|
|
ASSERT_EQ(2U, props.size());
|
|
|
|
for (const auto& item : props) {
|
|
|
|
auto& user_collected = item.second->user_collected_properties;
|
|
|
|
ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") !=
|
|
|
|
user_collected.end());
|
|
|
|
ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") !=
|
|
|
|
user_collected.end());
|
|
|
|
ASSERT_EQ(user_collected.at("rocksdb.timestamp_min"), Timestamp(0, 0));
|
|
|
|
ASSERT_EQ(user_collected.at("rocksdb.timestamp_max"), Timestamp(9, 0));
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
class DBBasicTestWithTimestampTableOptions
|
|
|
|
: public DBBasicTestWithTimestampBase,
|
|
|
|
public testing::WithParamInterface<BlockBasedTableOptions::IndexType> {
|
|
|
|
public:
|
|
|
|
explicit DBBasicTestWithTimestampTableOptions()
|
|
|
|
: DBBasicTestWithTimestampBase(
|
|
|
|
"db_basic_test_with_timestamp_table_options") {}
|
|
|
|
};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
Timestamp, DBBasicTestWithTimestampTableOptions,
|
|
|
|
testing::Values(
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kHashSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey));
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(3));
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.index_type = GetParam();
|
|
|
|
bbto.block_size = 100;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator cmp(kTimestampSize);
|
|
|
|
options.comparator = &cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
constexpr uint64_t kNumKeys = 1024;
|
|
|
|
for (uint64_t k = 0; k < kNumKeys; ++k) {
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, Key1(k), Timestamp(1, 0),
|
|
|
|
"value" + std::to_string(k)));
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
std::string ts_str = Timestamp(2, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
// verify Get()
|
|
|
|
for (it->SeekToFirst(); it->Valid(); it->Next()) {
|
|
|
|
std::string value_from_get;
|
|
|
|
std::string key_str(it->key().data(), it->key().size());
|
|
|
|
std::string timestamp;
|
|
|
|
ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, ×tamp));
|
|
|
|
ASSERT_EQ(it->value(), value_from_get);
|
|
|
|
ASSERT_EQ(Timestamp(1, 0), timestamp);
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
|
|
|
|
// verify MultiGet()
|
|
|
|
constexpr uint64_t step = 2;
|
|
|
|
static_assert(0 == (kNumKeys % step),
|
|
|
|
"kNumKeys must be a multiple of step");
|
|
|
|
for (uint64_t k = 0; k < kNumKeys; k += 2) {
|
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
for (size_t i = 0; i < step; ++i) {
|
|
|
|
key_strs.push_back(Key1(k + i));
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < step; ++i) {
|
|
|
|
keys.emplace_back(key_strs[i]);
|
|
|
|
}
|
|
|
|
std::vector<std::string> values;
|
|
|
|
std::vector<std::string> timestamps;
|
|
|
|
std::vector<Status> statuses =
|
|
|
|
db_->MultiGet(read_opts, keys, &values, ×tamps);
|
|
|
|
ASSERT_EQ(step, statuses.size());
|
|
|
|
ASSERT_EQ(step, values.size());
|
|
|
|
ASSERT_EQ(step, timestamps.size());
|
|
|
|
for (uint64_t i = 0; i < step; ++i) {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
ASSERT_EQ("value" + std::to_string(k + i), values[i]);
|
|
|
|
ASSERT_EQ(Timestamp(1, 0), timestamps[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
|
2020-11-10 22:51:38 +00:00
|
|
|
Options options = CurrentOptions();
|
Steps toward deprecating implicit prefix seek, related fixes (#13026)
Summary:
With some new use cases onboarding to prefix extractors/seek/filters, one of the risks is existing iterator code, e.g. for maintenance tasks, being unintentionally subject to prefix seek semantics. This is a longstanding known design flaw with prefix seek, and `prefix_same_as_start` and `auto_prefix_mode` were steps in the direction of making that obsolete. However, we can't just immediately set `total_order_seek` to true by default, because that would impact so much code instantly.
Here we add a new DB option, `prefix_seek_opt_in_only` that basically allows users to transition to the future behavior when they are ready. When set to true, all iterators will be treated as if `total_order_seek=true` and then the only ways to get prefix seek semantics are with `prefix_same_as_start` or `auto_prefix_mode`.
Related fixes / changes:
* Make sure that `prefix_same_as_start` and `auto_prefix_mode` are compatible with (or override) `total_order_seek` (depending on your interpretation).
* Fix a bug in which a new iterator after dynamically changing the prefix extractor might mix different prefix semantics between memtable and SSTs. Both should use the latest extractor semantics, which means iterators ignoring memtable prefix filters with an old extractor. And that means passing the latest prefix extractor to new memtable iterators that might use prefix seek. (Without the fix, the test added for this fails in many ways.)
Suggested follow-up:
* Investigate a FIXME where a MergeIteratorBuilder is created in db_impl.cc. No unit test detects a change in value that should impact correctness.
* Make memtable prefix bloom compatible with `auto_prefix_mode`, which might require involving the memtablereps because we don't know at iterator creation time (only seek time) whether an auto_prefix_mode seek will be a prefix seek.
* Add `prefix_same_as_start` testing to db_stress
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13026
Test Plan:
tests updated, added. Add combination of `total_order_seek=true` and `auto_prefix_mode=true` to stress test. Ran `make blackbox_crash_test` for a long while.
Manually ran tests with `prefix_seek_opt_in_only=true` as default, looking for unexpected issues. I inspected most of the results and migrated many tests to be ready for such a change (but not all).
Reviewed By: ltamasi
Differential Revision: D63147378
Pulled By: pdillinger
fbshipit-source-id: 1f4477b730683d43b4be7e933338583702d3c25e
2024-09-20 22:54:19 +00:00
|
|
|
options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
|
2020-11-10 22:51:38 +00:00
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(3));
|
2020-12-01 22:05:19 +00:00
|
|
|
options.memtable_whole_key_filtering = true;
|
|
|
|
options.memtable_prefix_bloom_size_ratio = 0.1;
|
2020-11-10 22:51:38 +00:00
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = true;
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = GetParam();
|
2020-11-10 22:51:38 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-11-10 22:51:38 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-10 22:51:38 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-10 22:51:38 +00:00
|
|
|
|
|
|
|
// Move sst file to next level
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar"));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-10 22:51:38 +00:00
|
|
|
|
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Slice read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2020-11-10 22:51:38 +00:00
|
|
|
{
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
2020-12-01 22:05:19 +00:00
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
|
2020-11-10 22:51:38 +00:00
|
|
|
iter->Seek("bbb");
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-12-21 19:29:52 +00:00
|
|
|
TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithCappedPrefix) {
|
2020-11-10 22:51:38 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
2021-12-21 19:29:52 +00:00
|
|
|
// All of the keys or this test must be longer than 3 characters
|
|
|
|
constexpr int kMinKeyLen = 3;
|
|
|
|
options.prefix_extractor.reset(NewCappedPrefixTransform(kMinKeyLen));
|
2020-12-01 22:05:19 +00:00
|
|
|
options.memtable_whole_key_filtering = true;
|
|
|
|
options.memtable_prefix_bloom_size_ratio = 0.1;
|
2020-11-10 22:51:38 +00:00
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = true;
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = GetParam();
|
2020-11-10 22:51:38 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-11-10 22:51:38 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-10 22:51:38 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-10 22:51:38 +00:00
|
|
|
|
|
|
|
// Move sst file to next level
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar"));
|
2020-12-02 20:59:23 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-10 22:51:38 +00:00
|
|
|
|
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(2, 0);
|
|
|
|
Slice read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2020-12-01 22:05:19 +00:00
|
|
|
{
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
// Make sure the prefix extractor doesn't include timestamp, otherwise it
|
|
|
|
// may return invalid result.
|
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) {
|
2020-12-01 22:05:19 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(2));
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = true;
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = GetParam();
|
2020-12-01 22:05:19 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-12-01 22:05:19 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar1"));
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar2"));
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
// Move sst file to next level
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
for (int i = 3; i < 9; ++i) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), ts,
|
2021-03-10 19:13:55 +00:00
|
|
|
"bar" + std::to_string(i)));
|
|
|
|
}
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(2, 0);
|
|
|
|
Slice read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2021-03-10 19:13:55 +00:00
|
|
|
std::string up_bound = "foo5"; // exclusive
|
2020-12-01 22:05:19 +00:00
|
|
|
Slice up_bound_slice = up_bound;
|
2021-03-10 19:13:55 +00:00
|
|
|
std::string lo_bound = "foo2"; // inclusive
|
|
|
|
Slice lo_bound_slice = lo_bound;
|
2020-12-01 22:05:19 +00:00
|
|
|
read_opts.iterate_upper_bound = &up_bound_slice;
|
2021-03-10 19:13:55 +00:00
|
|
|
read_opts.iterate_lower_bound = &lo_bound_slice;
|
2020-12-01 22:05:19 +00:00
|
|
|
read_opts.auto_prefix_mode = true;
|
2020-11-10 22:51:38 +00:00
|
|
|
{
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
// Make sure the prefix extractor doesn't include timestamp, otherwise it
|
|
|
|
// may return invalid result.
|
|
|
|
iter->Seek("foo");
|
2021-03-10 19:13:55 +00:00
|
|
|
CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2",
|
|
|
|
Timestamp(1, 0));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2",
|
|
|
|
Timestamp(1, 0));
|
|
|
|
iter->SeekForPrev("g");
|
|
|
|
CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0));
|
|
|
|
iter->SeekToLast();
|
|
|
|
CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0));
|
|
|
|
}
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.env = env_;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
|
Steps toward deprecating implicit prefix seek, related fixes (#13026)
Summary:
With some new use cases onboarding to prefix extractors/seek/filters, one of the risks is existing iterator code, e.g. for maintenance tasks, being unintentionally subject to prefix seek semantics. This is a longstanding known design flaw with prefix seek, and `prefix_same_as_start` and `auto_prefix_mode` were steps in the direction of making that obsolete. However, we can't just immediately set `total_order_seek` to true by default, because that would impact so much code instantly.
Here we add a new DB option, `prefix_seek_opt_in_only` that basically allows users to transition to the future behavior when they are ready. When set to true, all iterators will be treated as if `total_order_seek=true` and then the only ways to get prefix seek semantics are with `prefix_same_as_start` or `auto_prefix_mode`.
Related fixes / changes:
* Make sure that `prefix_same_as_start` and `auto_prefix_mode` are compatible with (or override) `total_order_seek` (depending on your interpretation).
* Fix a bug in which a new iterator after dynamically changing the prefix extractor might mix different prefix semantics between memtable and SSTs. Both should use the latest extractor semantics, which means iterators ignoring memtable prefix filters with an old extractor. And that means passing the latest prefix extractor to new memtable iterators that might use prefix seek. (Without the fix, the test added for this fails in many ways.)
Suggested follow-up:
* Investigate a FIXME where a MergeIteratorBuilder is created in db_impl.cc. No unit test detects a change in value that should impact correctness.
* Make memtable prefix bloom compatible with `auto_prefix_mode`, which might require involving the memtablereps because we don't know at iterator creation time (only seek time) whether an auto_prefix_mode seek will be a prefix seek.
* Add `prefix_same_as_start` testing to db_stress
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13026
Test Plan:
tests updated, added. Add combination of `total_order_seek=true` and `auto_prefix_mode=true` to stress test. Ran `make blackbox_crash_test` for a long while.
Manually ran tests with `prefix_seek_opt_in_only=true` as default, looking for unexpected issues. I inspected most of the results and migrated many tests to be ready for such a change (but not all).
Reviewed By: ltamasi
Differential Revision: D63147378
Pulled By: pdillinger
fbshipit-source-id: 1f4477b730683d43b4be7e933338583702d3c25e
2024-09-20 22:54:19 +00:00
|
|
|
options.prefix_seek_opt_in_only = false; // Use legacy prefix seek
|
2021-03-10 19:13:55 +00:00
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
|
|
|
|
Timestamp(4, 3)};
|
|
|
|
const std::vector<std::tuple<std::string, std::string>> kvs = {
|
|
|
|
std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")};
|
|
|
|
for (const auto& ts : timestamps) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
WriteBatch wb(0, 0, 0, kTimestampSize);
|
2021-03-10 19:13:55 +00:00
|
|
|
for (const auto& kv : kvs) {
|
|
|
|
const std::string& key = std::get<0>(kv);
|
|
|
|
const std::string& value = std::get<1>(kv);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(wb.Put(key, value));
|
2021-03-10 19:13:55 +00:00
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(wb.UpdateTimestamps(
|
|
|
|
ts, [kTimestampSize](uint32_t) { return kTimestampSize; }));
|
2021-03-10 19:13:55 +00:00
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &wb));
|
2020-11-10 22:51:38 +00:00
|
|
|
}
|
2021-03-10 19:13:55 +00:00
|
|
|
std::string read_ts_str = Timestamp(5, 3);
|
|
|
|
Slice read_ts = read_ts_str;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
|
|
|
|
it->SeekToFirst();
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
it->Prev();
|
|
|
|
ASSERT_FALSE(it->Valid());
|
2020-11-10 22:51:38 +00:00
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
it->SeekToLast();
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
uint64_t prev_reseek_count =
|
|
|
|
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
|
|
|
|
ASSERT_EQ(0, prev_reseek_count);
|
|
|
|
it->Next();
|
|
|
|
ASSERT_FALSE(it->Valid());
|
|
|
|
ASSERT_EQ(1 + prev_reseek_count,
|
|
|
|
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
|
|
|
|
|
|
|
|
it->Seek(std::get<0>(kvs[0]));
|
|
|
|
CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
|
|
|
|
std::get<1>(kvs[0]), Timestamp(4, 3));
|
|
|
|
it->Next();
|
|
|
|
CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
|
|
|
|
std::get<1>(kvs[1]), Timestamp(4, 3));
|
|
|
|
it->Prev();
|
|
|
|
CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
|
|
|
|
std::get<1>(kvs[0]), Timestamp(4, 3));
|
|
|
|
|
|
|
|
prev_reseek_count =
|
|
|
|
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
|
|
|
|
ASSERT_EQ(1, prev_reseek_count);
|
|
|
|
it->Next();
|
|
|
|
CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
|
|
|
|
std::get<1>(kvs[1]), Timestamp(4, 3));
|
|
|
|
ASSERT_EQ(1 + prev_reseek_count,
|
|
|
|
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
|
|
|
|
|
|
|
|
it->SeekForPrev(std::get<0>(kvs[1]));
|
|
|
|
CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
|
|
|
|
std::get<1>(kvs[1]), Timestamp(4, 3));
|
|
|
|
it->Prev();
|
|
|
|
CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
|
|
|
|
std::get<1>(kvs[0]), Timestamp(4, 3));
|
|
|
|
|
|
|
|
prev_reseek_count =
|
|
|
|
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
|
|
|
|
it->Next();
|
|
|
|
CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
|
|
|
|
std::get<1>(kvs[1]), Timestamp(4, 3));
|
|
|
|
ASSERT_EQ(1 + prev_reseek_count,
|
|
|
|
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
|
|
|
|
|
|
|
|
it.reset();
|
2020-11-10 22:51:38 +00:00
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-04-10 16:49:38 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) {
|
2020-08-05 01:50:05 +00:00
|
|
|
constexpr int kNumKeysPerFile = 128;
|
|
|
|
constexpr uint64_t kMaxKey = 1024;
|
2020-04-10 16:49:38 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
2021-09-08 14:45:59 +00:00
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
2020-04-10 16:49:38 +00:00
|
|
|
DestroyAndReopen(options);
|
|
|
|
const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
|
|
|
|
Timestamp(3, 0)};
|
|
|
|
const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
|
|
|
|
Timestamp(4, 0)};
|
|
|
|
const std::vector<std::string> read_timestamps_lb = {Timestamp(1, 0),
|
|
|
|
Timestamp(1, 0)};
|
|
|
|
for (size_t i = 0; i < write_timestamps.size(); ++i) {
|
|
|
|
WriteOptions write_opts;
|
|
|
|
for (uint64_t key = 0; key <= kMaxKey; ++key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
|
|
|
|
"value" + std::to_string(i));
|
2020-04-10 16:49:38 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < read_timestamps.size(); ++i) {
|
|
|
|
ReadOptions read_opts;
|
|
|
|
Slice read_ts = read_timestamps[i];
|
|
|
|
Slice read_ts_lb = read_timestamps_lb[i];
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
read_opts.iter_start_ts = &read_ts_lb;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
int count = 0;
|
|
|
|
uint64_t key = 0;
|
|
|
|
for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
|
2020-08-05 01:50:05 +00:00
|
|
|
CheckIterEntry(it.get(), Key1(key), kTypeValue,
|
|
|
|
"value" + std::to_string(i), write_timestamps[i]);
|
2020-04-10 16:49:38 +00:00
|
|
|
if (i > 0) {
|
|
|
|
it->Next();
|
2020-08-05 01:50:05 +00:00
|
|
|
CheckIterEntry(it.get(), Key1(key), kTypeValue,
|
|
|
|
"value" + std::to_string(i - 1),
|
|
|
|
write_timestamps[i - 1]);
|
2020-04-10 16:49:38 +00:00
|
|
|
}
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2020-04-10 16:49:38 +00:00
|
|
|
size_t expected_count = kMaxKey + 1;
|
|
|
|
ASSERT_EQ(expected_count, count);
|
|
|
|
}
|
2020-08-05 01:50:05 +00:00
|
|
|
// Delete all keys@ts=5 and check iteration result with start ts set
|
|
|
|
{
|
|
|
|
std::string write_timestamp = Timestamp(5, 0);
|
|
|
|
WriteOptions write_opts;
|
|
|
|
for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Delete(write_opts, Key1(key), write_timestamp);
|
2020-08-05 01:50:05 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string read_timestamp = Timestamp(6, 0);
|
|
|
|
ReadOptions read_opts;
|
|
|
|
Slice read_ts = read_timestamp;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::string read_timestamp_lb = Timestamp(2, 0);
|
|
|
|
Slice read_ts_lb = read_timestamp_lb;
|
|
|
|
read_opts.iter_start_ts = &read_ts_lb;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
int count = 0;
|
|
|
|
uint64_t key = 0;
|
|
|
|
for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
|
|
|
|
CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
write_timestamp);
|
2020-08-05 01:50:05 +00:00
|
|
|
// Skip key@ts=3 and land on tombstone key@ts=5
|
|
|
|
it->Next();
|
|
|
|
}
|
|
|
|
ASSERT_EQ(kMaxKey + 1, count);
|
|
|
|
}
|
2020-04-10 16:49:38 +00:00
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2022-06-29 02:51:05 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound) {
|
|
|
|
constexpr int kNumKeysPerFile = 128;
|
|
|
|
constexpr uint64_t kMaxKey = 1024;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
|
|
|
|
Timestamp(3, 0)};
|
|
|
|
const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
|
|
|
|
Timestamp(4, 0)};
|
|
|
|
const std::vector<std::string> read_timestamps_lb = {Timestamp(1, 0),
|
|
|
|
Timestamp(1, 0)};
|
|
|
|
for (size_t i = 0; i < write_timestamps.size(); ++i) {
|
|
|
|
WriteOptions write_opts;
|
|
|
|
for (uint64_t key = 0; key <= kMaxKey; ++key) {
|
|
|
|
Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
|
|
|
|
"value" + std::to_string(i));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i < read_timestamps.size(); ++i) {
|
|
|
|
ReadOptions read_opts;
|
|
|
|
Slice read_ts = read_timestamps[i];
|
|
|
|
Slice read_ts_lb = read_timestamps_lb[i];
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
read_opts.iter_start_ts = &read_ts_lb;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
int count = 0;
|
|
|
|
uint64_t key = 0;
|
|
|
|
for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
|
|
|
|
it->Prev(), ++count, --key) {
|
|
|
|
CheckIterEntry(it.get(), Key1(key), kTypeValue, "value0",
|
|
|
|
write_timestamps[0]);
|
|
|
|
if (i > 0) {
|
|
|
|
it->Prev();
|
|
|
|
CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1",
|
|
|
|
write_timestamps[1]);
|
|
|
|
}
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2022-06-29 02:51:05 +00:00
|
|
|
size_t expected_count = kMaxKey + 1;
|
|
|
|
ASSERT_EQ(expected_count, count);
|
|
|
|
}
|
|
|
|
// Delete all keys@ts=5 and check iteration result with start ts set
|
|
|
|
{
|
|
|
|
std::string write_timestamp = Timestamp(5, 0);
|
|
|
|
WriteOptions write_opts;
|
|
|
|
for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
|
|
|
|
Status s = db_->Delete(write_opts, Key1(key), write_timestamp);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string read_timestamp = Timestamp(6, 0);
|
|
|
|
ReadOptions read_opts;
|
|
|
|
Slice read_ts = read_timestamp;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::string read_timestamp_lb = Timestamp(2, 0);
|
|
|
|
Slice read_ts_lb = read_timestamp_lb;
|
|
|
|
read_opts.iter_start_ts = &read_ts_lb;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
int count = 0;
|
|
|
|
uint64_t key = kMaxKey;
|
|
|
|
for (it->SeekForPrev(Key1(key)), key = kMaxKey; it->Valid();
|
|
|
|
it->Prev(), ++count, --key) {
|
|
|
|
CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1",
|
|
|
|
Timestamp(3, 0));
|
|
|
|
it->Prev();
|
|
|
|
CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
|
|
|
|
write_timestamp);
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2022-06-29 02:51:05 +00:00
|
|
|
ASSERT_EQ(kMaxKey + 1, count);
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, SimpleBackwardIterateLowerTsBound) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::string ts_ub_buf = Timestamp(5, 0);
|
|
|
|
Slice ts_ub = ts_ub_buf;
|
|
|
|
std::string ts_lb_buf = Timestamp(1, 0);
|
|
|
|
Slice ts_lb = ts_lb_buf;
|
|
|
|
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &ts_ub;
|
|
|
|
read_opts.iter_start_ts = &ts_lb;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
it->SeekToLast();
|
|
|
|
ASSERT_FALSE(it->Valid());
|
|
|
|
ASSERT_OK(it->status());
|
|
|
|
|
|
|
|
it->SeekForPrev("foo");
|
|
|
|
ASSERT_FALSE(it->Valid());
|
|
|
|
ASSERT_OK(it->status());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test iterate_upper_bound
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(0, 0), "v0"));
|
|
|
|
ASSERT_OK(db_->SingleDelete(WriteOptions(), "a", Timestamp(1, 0)));
|
|
|
|
|
|
|
|
for (int i = 0; i < 5; ++i) {
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0),
|
|
|
|
"v" + std::to_string(i)));
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &ts_ub;
|
|
|
|
read_opts.iter_start_ts = &ts_lb;
|
|
|
|
std::string key_ub_str = "b"; // exclusive
|
|
|
|
Slice key_ub = key_ub_str;
|
|
|
|
read_opts.iterate_upper_bound = &key_ub;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
it->SeekToLast();
|
|
|
|
CheckIterEntry(it.get(), "a", kTypeSingleDeletion, Slice(),
|
|
|
|
Timestamp(1, 0));
|
2022-06-30 17:16:03 +00:00
|
|
|
|
|
|
|
key_ub_str = "a"; // exclusive
|
|
|
|
key_ub = key_ub_str;
|
|
|
|
read_opts.iterate_upper_bound = &key_ub;
|
|
|
|
it.reset(db_->NewIterator(read_opts));
|
|
|
|
it->SeekToLast();
|
|
|
|
ASSERT_FALSE(it->Valid());
|
|
|
|
ASSERT_OK(it->status());
|
2022-06-29 02:51:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound_Reseek) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.max_sequential_skip_in_iterations = 2;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
for (int i = 0; i < 10; ++i) {
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(i, 0),
|
|
|
|
"v" + std::to_string(i)));
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < 10; ++i) {
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0),
|
|
|
|
"v" + std::to_string(i)));
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
std::string ts_ub_buf = Timestamp(6, 0);
|
|
|
|
Slice ts_ub = ts_ub_buf;
|
|
|
|
std::string ts_lb_buf = Timestamp(4, 0);
|
|
|
|
Slice ts_lb = ts_lb_buf;
|
|
|
|
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &ts_ub;
|
|
|
|
read_opts.iter_start_ts = &ts_lb;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
it->SeekToLast();
|
|
|
|
for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) {
|
|
|
|
CheckIterEntry(it.get(), "b", kTypeValue, "v" + std::to_string(4 + i),
|
|
|
|
Timestamp(4 + i, 0));
|
|
|
|
}
|
|
|
|
for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) {
|
|
|
|
CheckIterEntry(it.get(), "a", kTypeValue, "v" + std::to_string(4 + i),
|
|
|
|
Timestamp(4 + i, 0));
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2022-06-29 02:51:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-03-13 18:33:04 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
constexpr size_t kNumKeys = 16;
|
|
|
|
options.max_sequential_skip_in_iterations = kNumKeys / 2;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
// Insert kNumKeys
|
|
|
|
WriteOptions write_opts;
|
|
|
|
Status s;
|
|
|
|
for (size_t i = 0; i != kNumKeys; ++i) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
|
|
|
|
s = db_->Put(write_opts, "foo", ts, "value" + std::to_string(i));
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string ts_str = Timestamp(1, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
iter->SeekToFirst();
|
2020-08-05 01:50:05 +00:00
|
|
|
CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str);
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_EQ(
|
|
|
|
1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
|
2021-03-10 19:13:55 +00:00
|
|
|
|
|
|
|
ts_str = Timestamp(kNumKeys, 0);
|
|
|
|
ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
iter.reset(db_->NewIterator(read_opts));
|
|
|
|
iter->SeekToLast();
|
|
|
|
CheckIterUserEntry(iter.get(), "foo", kTypeValue,
|
|
|
|
"value" + std::to_string(kNumKeys - 1), ts_str);
|
|
|
|
ASSERT_EQ(
|
|
|
|
2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
constexpr size_t kNumKeys = 16;
|
|
|
|
options.max_sequential_skip_in_iterations = kNumKeys / 2;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
// Write kNumKeys + 1 keys
|
|
|
|
WriteOptions write_opts;
|
|
|
|
Status s;
|
|
|
|
for (size_t i = 0; i != kNumKeys; ++i) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
|
|
|
|
s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
WriteBatch batch(0, 0, 0, kTimestampSize);
|
|
|
|
{ ASSERT_OK(batch.Put("a", "new_value")); }
|
|
|
|
{ ASSERT_OK(batch.Put("b", "new_value")); }
|
|
|
|
s = batch.UpdateTimestamps(
|
|
|
|
ts_str, [kTimestampSize](uint32_t) { return kTimestampSize; });
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
s = db_->Write(write_opts, &batch);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
iter->Seek("a");
|
|
|
|
iter->Next();
|
2020-08-05 01:50:05 +00:00
|
|
|
CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str);
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_EQ(
|
|
|
|
1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
constexpr size_t kNumKeys = 16;
|
|
|
|
options.max_sequential_skip_in_iterations = kNumKeys / 2;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
for (size_t i = 0; i < kNumKeys; ++i) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
|
2021-03-10 19:13:55 +00:00
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
|
2021-03-10 19:13:55 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
{
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2021-03-10 19:13:55 +00:00
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
|
2021-03-10 19:13:55 +00:00
|
|
|
}
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string ts_str = Timestamp(1, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
iter->SeekToLast();
|
|
|
|
iter->Prev();
|
|
|
|
CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str);
|
|
|
|
ASSERT_EQ(
|
|
|
|
1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-11-03 17:44:21 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = true;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Write any value
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-11-03 17:44:21 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
|
2020-11-03 17:44:21 +00:00
|
|
|
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-03 17:44:21 +00:00
|
|
|
|
|
|
|
// Read with MultiGet
|
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Slice read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2020-11-03 17:44:21 +00:00
|
|
|
size_t batch_size = 1;
|
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
2022-06-04 03:00:42 +00:00
|
|
|
std::vector<std::string> timestamps(batch_size);
|
2020-11-03 17:44:21 +00:00
|
|
|
keys[0] = "foo";
|
|
|
|
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
|
|
|
|
db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
|
2022-06-04 03:00:42 +00:00
|
|
|
timestamps.data(), statuses.data(), true);
|
2020-11-03 17:44:21 +00:00
|
|
|
|
|
|
|
ASSERT_OK(statuses[0]);
|
2022-06-04 03:00:42 +00:00
|
|
|
ASSERT_EQ(Timestamp(1, 0), timestamps[0]);
|
|
|
|
for (auto& elem : values) {
|
|
|
|
elem.Reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0)));
|
|
|
|
ts = Timestamp(3, 0);
|
|
|
|
read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
|
|
|
|
timestamps.data(), statuses.data(), true);
|
|
|
|
ASSERT_TRUE(statuses[0].IsNotFound());
|
|
|
|
ASSERT_EQ(Timestamp(2, 0), timestamps[0]);
|
|
|
|
|
2020-11-03 17:44:21 +00:00
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) {
|
2020-11-03 17:44:21 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.prefix_extractor.reset(NewCappedPrefixTransform(5));
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = false;
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = GetParam();
|
2020-11-03 17:44:21 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Write any value
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-11-03 17:44:21 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
|
2020-11-03 17:44:21 +00:00
|
|
|
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-03 17:44:21 +00:00
|
|
|
|
|
|
|
// Read with MultiGet
|
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Slice read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2020-11-03 17:44:21 +00:00
|
|
|
size_t batch_size = 1;
|
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
2022-06-04 03:00:42 +00:00
|
|
|
std::vector<std::string> timestamps(batch_size);
|
2020-11-03 17:44:21 +00:00
|
|
|
keys[0] = "foo";
|
|
|
|
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
|
|
|
|
db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
|
2022-06-04 03:00:42 +00:00
|
|
|
timestamps.data(), statuses.data(), true);
|
2020-11-03 17:44:21 +00:00
|
|
|
|
|
|
|
ASSERT_OK(statuses[0]);
|
2022-06-04 03:00:42 +00:00
|
|
|
ASSERT_EQ(Timestamp(1, 0), timestamps[0]);
|
|
|
|
for (auto& elem : values) {
|
|
|
|
elem.Reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0)));
|
|
|
|
// TODO re-enable after fixing a bug of kHashSearch
|
|
|
|
if (GetParam() != BlockBasedTableOptions::IndexType::kHashSearch) {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
ts = Timestamp(3, 0);
|
|
|
|
read_ts = ts;
|
|
|
|
db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
|
|
|
|
timestamps.data(), statuses.data(), true);
|
|
|
|
ASSERT_TRUE(statuses[0].IsNotFound());
|
|
|
|
ASSERT_EQ(Timestamp(2, 0), timestamps[0]);
|
|
|
|
|
2020-11-03 17:44:21 +00:00
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) {
|
2020-11-03 17:44:21 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.prefix_extractor.reset(NewCappedPrefixTransform(5));
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = false;
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = GetParam();
|
2020-11-03 17:44:21 +00:00
|
|
|
options.memtable_prefix_bloom_size_ratio = 0.1;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Write any value
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-11-03 17:44:21 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
|
2020-11-03 17:44:21 +00:00
|
|
|
|
|
|
|
// Read with MultiGet
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(2, 0);
|
|
|
|
Slice read_ts = ts;
|
2020-11-03 17:44:21 +00:00
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
read_opts.timestamp = &read_ts;
|
2020-11-03 17:44:21 +00:00
|
|
|
size_t batch_size = 1;
|
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
|
|
|
keys[0] = "foo";
|
|
|
|
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
|
|
|
|
db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
|
|
|
|
statuses.data());
|
|
|
|
|
|
|
|
ASSERT_OK(statuses[0]);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = false;
|
|
|
|
options.memtable_prefix_bloom_size_ratio = 0.1;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Write any value
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-11-03 17:44:21 +00:00
|
|
|
|
|
|
|
// random data
|
|
|
|
for (int i = 0; i < 3; i++) {
|
2022-05-06 20:03:58 +00:00
|
|
|
auto key = std::to_string(i * 10);
|
|
|
|
auto value = std::to_string(i * 10);
|
2020-11-03 17:44:21 +00:00
|
|
|
Slice key_slice = key;
|
|
|
|
Slice value_slice = value;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, key_slice, ts, value_slice));
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-03 17:44:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Make num_levels to 2 to do key range filtering of sst files
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
|
2020-11-03 17:44:21 +00:00
|
|
|
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-03 17:44:21 +00:00
|
|
|
|
|
|
|
// Read with MultiGet
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(2, 0);
|
|
|
|
Slice read_ts = ts;
|
2020-11-03 17:44:21 +00:00
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
read_opts.timestamp = &read_ts;
|
2020-11-03 17:44:21 +00:00
|
|
|
size_t batch_size = 1;
|
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
|
|
|
keys[0] = "foo";
|
|
|
|
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
|
|
|
|
db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
|
|
|
|
statuses.data());
|
|
|
|
|
|
|
|
ASSERT_OK(statuses[0]);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-09-20 18:34:38 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, GetWithRowCache) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
LRUCacheOptions cache_options;
|
|
|
|
cache_options.capacity = 8192;
|
|
|
|
options.row_cache = cache_options.MakeSharedRowCache();
|
|
|
|
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
|
|
|
std::string ts_early = Timestamp(1, 0);
|
|
|
|
std::string ts_later = Timestamp(10, 0);
|
|
|
|
Slice ts_later_slice = ts_later;
|
|
|
|
|
|
|
|
const Snapshot* snap_with_nothing = db_->GetSnapshot();
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo", ts_early, "bar"));
|
2023-11-22 04:39:33 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo2", ts_early, "bar2"));
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo3", ts_early, "bar3"));
|
2023-09-20 18:34:38 +00:00
|
|
|
|
2023-11-22 04:39:33 +00:00
|
|
|
const Snapshot* snap_with_foo = db_->GetSnapshot();
|
2023-09-20 18:34:38 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &ts_later_slice;
|
|
|
|
|
|
|
|
std::string read_value;
|
|
|
|
std::string read_ts;
|
2023-11-22 04:39:33 +00:00
|
|
|
Status s;
|
2023-09-20 18:34:38 +00:00
|
|
|
|
2023-11-22 04:39:33 +00:00
|
|
|
int expected_hit_count = 0;
|
|
|
|
int expected_miss_count = 0;
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), expected_miss_count);
|
|
|
|
|
|
|
|
{
|
|
|
|
read_opts.timestamp = nullptr;
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value);
|
|
|
|
ASSERT_NOK(s);
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mix use of Get
|
|
|
|
{
|
|
|
|
read_opts.timestamp = &ts_later_slice;
|
|
|
|
|
|
|
|
// Use Get without ts first, expect cache entry to store the correct ts
|
|
|
|
s = db_->Get(read_opts, "foo2", &read_value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS),
|
|
|
|
++expected_miss_count);
|
|
|
|
ASSERT_EQ(read_value, "bar2");
|
|
|
|
|
|
|
|
s = db_->Get(read_opts, "foo2", &read_value, &read_ts);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), ++expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), expected_miss_count);
|
|
|
|
ASSERT_EQ(read_ts, ts_early);
|
|
|
|
ASSERT_EQ(read_value, "bar2");
|
|
|
|
|
|
|
|
// Use Get with ts first, expect the Get without ts can get correct record
|
|
|
|
s = db_->Get(read_opts, "foo3", &read_value, &read_ts);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS),
|
|
|
|
++expected_miss_count);
|
|
|
|
ASSERT_EQ(read_ts, ts_early);
|
|
|
|
ASSERT_EQ(read_value, "bar3");
|
|
|
|
|
|
|
|
s = db_->Get(read_opts, "foo3", &read_value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), ++expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), expected_miss_count);
|
|
|
|
ASSERT_EQ(read_value, "bar3");
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Test with consecutive calls of Get with ts.
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS),
|
|
|
|
++expected_miss_count);
|
|
|
|
ASSERT_EQ(read_ts, ts_early);
|
|
|
|
ASSERT_EQ(read_value, "bar");
|
|
|
|
|
|
|
|
// Test repeated get on cache entry
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT),
|
|
|
|
++expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS),
|
|
|
|
expected_miss_count);
|
|
|
|
ASSERT_EQ(read_ts, ts_early);
|
|
|
|
ASSERT_EQ(read_value, "bar");
|
|
|
|
}
|
|
|
|
}
|
2023-09-20 18:34:38 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
std::string ts_nothing = Timestamp(0, 0);
|
|
|
|
Slice ts_nothing_slice = ts_nothing;
|
|
|
|
read_opts.timestamp = &ts_nothing_slice;
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2023-11-22 04:39:33 +00:00
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS),
|
|
|
|
++expected_miss_count);
|
2023-09-20 18:34:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
read_opts.snapshot = snap_with_foo;
|
2023-11-22 04:39:33 +00:00
|
|
|
read_opts.timestamp = &ts_later_slice;
|
2023-09-20 18:34:38 +00:00
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_OK(s);
|
2023-11-22 04:39:33 +00:00
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS),
|
|
|
|
++expected_miss_count);
|
|
|
|
ASSERT_EQ(read_ts, ts_early);
|
|
|
|
ASSERT_EQ(read_value, "bar");
|
2023-09-20 18:34:38 +00:00
|
|
|
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_OK(s);
|
2023-11-22 04:39:33 +00:00
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), ++expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), expected_miss_count);
|
|
|
|
ASSERT_EQ(read_ts, ts_early);
|
|
|
|
ASSERT_EQ(read_value, "bar");
|
2023-09-20 18:34:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
read_opts.snapshot = snap_with_nothing;
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2023-11-22 04:39:33 +00:00
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS),
|
|
|
|
++expected_miss_count);
|
2023-09-20 18:34:38 +00:00
|
|
|
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2023-11-22 04:39:33 +00:00
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), expected_hit_count);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS),
|
|
|
|
++expected_miss_count);
|
2023-09-20 18:34:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
db_->ReleaseSnapshot(snap_with_nothing);
|
|
|
|
db_->ReleaseSnapshot(snap_with_foo);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-11-22 04:39:33 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, GetWithRowCacheMultiSST) {
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 1;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
LRUCacheOptions cache_options;
|
|
|
|
cache_options.capacity = 8192;
|
|
|
|
options.row_cache = cache_options.MakeSharedRowCache();
|
|
|
|
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::string ts_early = Timestamp(1, 0);
|
|
|
|
std::string ts_later = Timestamp(10, 0);
|
|
|
|
Slice ts_later_slice = ts_later;
|
|
|
|
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "foo", ts_early, "v1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->Merge(WriteOptions(), default_cf, "foo", Timestamp(2, 0), "v2"));
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->Merge(WriteOptions(), default_cf, "foo", Timestamp(3, 0), "v3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &ts_later_slice;
|
|
|
|
|
|
|
|
std::string read_value;
|
|
|
|
std::string read_ts;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
{
|
|
|
|
// Since there are two SST files, will trigger the table lookup twice.
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
|
|
|
|
ASSERT_EQ(read_ts, Timestamp(3, 0));
|
|
|
|
ASSERT_EQ(read_value, "v1,v2,v3");
|
|
|
|
|
|
|
|
s = db_->Get(read_opts, "foo", &read_value, &read_ts);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
|
|
|
|
ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
|
|
|
|
ASSERT_EQ(read_ts, Timestamp(3, 0));
|
|
|
|
ASSERT_EQ(read_value, "v1,v2,v3");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) {
|
2020-11-03 17:44:21 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
2021-12-21 19:29:52 +00:00
|
|
|
options.prefix_extractor.reset(NewCappedPrefixTransform(3));
|
2020-11-03 17:44:21 +00:00
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = false;
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = GetParam();
|
2020-11-03 17:44:21 +00:00
|
|
|
options.memtable_prefix_bloom_size_ratio = 0.1;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-11-03 17:44:21 +00:00
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
|
2020-11-03 17:44:21 +00:00
|
|
|
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-11-03 17:44:21 +00:00
|
|
|
// Read with MultiGet
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(2, 0);
|
|
|
|
Slice read_ts = ts;
|
2020-11-03 17:44:21 +00:00
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
read_opts.timestamp = &read_ts;
|
2020-11-03 17:44:21 +00:00
|
|
|
size_t batch_size = 1;
|
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<std::string> values(batch_size);
|
|
|
|
std::vector<std::string> timestamps(batch_size);
|
|
|
|
keys[0] = "foo";
|
|
|
|
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
|
|
|
|
std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
|
|
|
|
std::vector<Status> statuses =
|
|
|
|
db_->MultiGet(read_opts, cfhs, keys, &values, ×tamps);
|
|
|
|
|
|
|
|
ASSERT_OK(statuses[0]);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) {
|
2020-03-13 18:33:04 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
constexpr size_t max_skippable_internal_keys = 2;
|
|
|
|
const size_t kNumKeys = max_skippable_internal_keys + 2;
|
|
|
|
WriteOptions write_opts;
|
|
|
|
Status s;
|
|
|
|
{
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
|
|
|
for (size_t i = 0; i < kNumKeys; ++i) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
|
|
|
|
s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
|
|
|
|
std::string ts_str = Timestamp(1, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->status().IsIncomplete());
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
constexpr size_t max_skippable_internal_keys = 2;
|
|
|
|
const size_t kNumKeys = max_skippable_internal_keys + 2;
|
|
|
|
WriteOptions write_opts;
|
|
|
|
Status s;
|
|
|
|
{
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "b", ts, "value"));
|
2021-03-10 19:13:55 +00:00
|
|
|
}
|
|
|
|
for (size_t i = 0; i < kNumKeys; ++i) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
|
|
|
|
s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
|
2021-03-10 19:13:55 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
|
|
|
|
std::string ts_str = Timestamp(1, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
iter->SeekToLast();
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->status().IsIncomplete());
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-05-28 17:37:57 +00:00
|
|
|
// Create two L0, and compact them to a new L1. In this test, L1 is L_bottom.
|
|
|
|
// Two L0s:
|
|
|
|
// f1 f2
|
|
|
|
// <a, 1, kTypeValue> <a, 3, kTypeDeletionWithTimestamp>...<b, 2, kTypeValue>
|
|
|
|
// Since f2.smallest < f1.largest < f2.largest
|
|
|
|
// f1 and f2 will be the inputs of a real compaction instead of trivial move.
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.num_levels = 2;
|
|
|
|
options.level0_file_num_compaction_trigger = 2;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "a", ts, "value0"));
|
2020-05-28 17:37:57 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(2, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "b", ts, "value0"));
|
|
|
|
ts = Timestamp(3, 0);
|
|
|
|
ASSERT_OK(db_->Delete(write_opts, "a", ts));
|
2020-05-28 17:37:57 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
|
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(1, 0);
|
|
|
|
Slice read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2020-05-28 17:37:57 +00:00
|
|
|
std::string value;
|
|
|
|
Status s = db_->Get(read_opts, "a", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("value0", value);
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(3, 0);
|
|
|
|
read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2022-06-04 03:00:42 +00:00
|
|
|
std::string key_ts;
|
|
|
|
s = db_->Get(read_opts, "a", &value, &key_ts);
|
2020-05-28 17:37:57 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2022-06-04 03:00:42 +00:00
|
|
|
ASSERT_EQ(Timestamp(3, 0), key_ts);
|
2020-05-28 17:37:57 +00:00
|
|
|
|
|
|
|
// Time-travel to the past before deletion
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(2, 0);
|
|
|
|
read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2020-05-28 17:37:57 +00:00
|
|
|
s = db_->Get(read_opts, "a", &value);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("value0", value);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-07-07 18:13:09 +00:00
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
2020-12-01 22:05:19 +00:00
|
|
|
class DBBasicTestWithTimestampFilterPrefixSettings
|
|
|
|
: public DBBasicTestWithTimestampBase,
|
|
|
|
public testing::WithParamInterface<
|
|
|
|
std::tuple<std::shared_ptr<const FilterPolicy>, bool, bool,
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
std::shared_ptr<const SliceTransform>, bool, double,
|
|
|
|
BlockBasedTableOptions::IndexType>> {
|
2020-12-01 22:05:19 +00:00
|
|
|
public:
|
|
|
|
DBBasicTestWithTimestampFilterPrefixSettings()
|
|
|
|
: DBBasicTestWithTimestampBase(
|
|
|
|
"db_basic_test_with_timestamp_filter_prefix") {}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, GetAndMultiGet) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy = std::get<0>(GetParam());
|
|
|
|
bbto.whole_key_filtering = std::get<1>(GetParam());
|
|
|
|
bbto.cache_index_and_filter_blocks = std::get<2>(GetParam());
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = std::get<6>(GetParam());
|
2020-12-01 22:05:19 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
options.prefix_extractor = std::get<3>(GetParam());
|
|
|
|
options.memtable_whole_key_filtering = std::get<4>(GetParam());
|
|
|
|
options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam());
|
|
|
|
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
const int kMaxKey = 1000;
|
|
|
|
|
|
|
|
// Write any value
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-12-01 22:05:19 +00:00
|
|
|
|
|
|
|
int idx = 0;
|
|
|
|
for (; idx < kMaxKey / 4; idx++) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
|
|
|
|
ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
|
2020-12-01 22:05:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
|
|
|
|
for (; idx < kMaxKey / 2; idx++) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
|
|
|
|
ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
|
2020-12-01 22:05:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
for (; idx < kMaxKey; idx++) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
|
|
|
|
ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
|
2020-12-01 22:05:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Read with MultiGet
|
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Slice read_ts = ts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
2020-12-01 22:05:19 +00:00
|
|
|
|
|
|
|
for (idx = 0; idx < kMaxKey; idx++) {
|
|
|
|
size_t batch_size = 4;
|
|
|
|
std::vector<std::string> keys_str(batch_size);
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
|
|
|
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
|
|
|
|
|
|
|
|
keys_str[0] = Key1(idx);
|
|
|
|
keys_str[1] = KeyWithPrefix("foo", idx);
|
|
|
|
keys_str[2] = Key1(kMaxKey + idx);
|
|
|
|
keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx);
|
|
|
|
|
|
|
|
auto keys = ConvertStrToSlice(keys_str);
|
|
|
|
|
|
|
|
db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
|
|
|
|
statuses.data());
|
|
|
|
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
}
|
|
|
|
for (int i = 2; i < 4; i++) {
|
|
|
|
ASSERT_TRUE(statuses[i].IsNotFound());
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
std::string value;
|
|
|
|
ASSERT_OK(db_->Get(read_opts, keys[i], &value));
|
|
|
|
std::unique_ptr<Iterator> it1(db_->NewIterator(read_opts));
|
|
|
|
ASSERT_NE(nullptr, it1);
|
|
|
|
ASSERT_OK(it1->status());
|
2022-06-04 03:00:42 +00:00
|
|
|
it1->Seek(keys[i]);
|
|
|
|
ASSERT_TRUE(it1->Valid());
|
2020-12-01 22:05:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 2; i < 4; i++) {
|
|
|
|
std::string value;
|
|
|
|
Status s = db_->Get(read_opts, keys[i], &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
Timestamp, DBBasicTestWithTimestampFilterPrefixSettings,
|
|
|
|
::testing::Combine(
|
|
|
|
::testing::Values(
|
|
|
|
std::shared_ptr<const FilterPolicy>(nullptr),
|
|
|
|
std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10, true)),
|
|
|
|
std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10,
|
|
|
|
false))),
|
|
|
|
::testing::Bool(), ::testing::Bool(),
|
|
|
|
::testing::Values(
|
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
|
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
|
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
|
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
::testing::Bool(), ::testing::Values(0, 0.1),
|
|
|
|
::testing::Values(
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kHashSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
|
2021-07-07 18:13:09 +00:00
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
2020-12-01 22:05:19 +00:00
|
|
|
|
2020-08-15 02:19:36 +00:00
|
|
|
class DataVisibilityTest : public DBBasicTestWithTimestampBase {
|
|
|
|
public:
|
2020-09-25 16:40:51 +00:00
|
|
|
DataVisibilityTest() : DBBasicTestWithTimestampBase("data_visibility_test") {
|
|
|
|
// Initialize test data
|
|
|
|
for (int i = 0; i < kTestDataSize; i++) {
|
2022-05-06 20:03:58 +00:00
|
|
|
test_data_[i].key = "key" + std::to_string(i);
|
|
|
|
test_data_[i].value = "value" + std::to_string(i);
|
2020-09-25 16:40:51 +00:00
|
|
|
test_data_[i].timestamp = Timestamp(i, 0);
|
|
|
|
test_data_[i].ts = i;
|
|
|
|
test_data_[i].seq_num = kMaxSequenceNumber;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
struct TestData {
|
|
|
|
std::string key;
|
|
|
|
std::string value;
|
|
|
|
int ts;
|
|
|
|
std::string timestamp;
|
|
|
|
SequenceNumber seq_num;
|
|
|
|
};
|
|
|
|
|
|
|
|
constexpr static int kTestDataSize = 3;
|
|
|
|
TestData test_data_[kTestDataSize];
|
|
|
|
|
|
|
|
void PutTestData(int index, ColumnFamilyHandle* cfh = nullptr) {
|
|
|
|
ASSERT_LE(index, kTestDataSize);
|
|
|
|
WriteOptions write_opts;
|
|
|
|
|
|
|
|
if (cfh == nullptr) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(write_opts, test_data_[index].key,
|
|
|
|
test_data_[index].timestamp, test_data_[index].value));
|
2020-09-25 16:40:51 +00:00
|
|
|
const Snapshot* snap = db_->GetSnapshot();
|
|
|
|
test_data_[index].seq_num = snap->GetSequenceNumber();
|
|
|
|
if (index > 0) {
|
|
|
|
ASSERT_GT(test_data_[index].seq_num, test_data_[index - 1].seq_num);
|
|
|
|
}
|
|
|
|
db_->ReleaseSnapshot(snap);
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(db_->Put(write_opts, cfh, test_data_[index].key,
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
test_data_[index].timestamp, test_data_[index].value));
|
2020-09-25 16:40:51 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void AssertVisibility(int ts, SequenceNumber seq,
|
|
|
|
std::vector<Status> statuses) {
|
|
|
|
ASSERT_EQ(kTestDataSize, statuses.size());
|
|
|
|
for (int i = 0; i < kTestDataSize; i++) {
|
|
|
|
if (test_data_[i].seq_num <= seq && test_data_[i].ts <= ts) {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(statuses[i].IsNotFound());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Slice> GetKeys() {
|
|
|
|
std::vector<Slice> ret(kTestDataSize);
|
|
|
|
for (int i = 0; i < kTestDataSize; i++) {
|
|
|
|
ret[i] = test_data_[i].key;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyDefaultCF(int ts, const Snapshot* snap = nullptr) {
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string read_ts = Timestamp(ts, 0);
|
|
|
|
Slice read_ts_slice = read_ts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
read_opts.snapshot = snap;
|
|
|
|
|
|
|
|
ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(kTestDataSize, cfh);
|
|
|
|
SequenceNumber seq =
|
|
|
|
snap ? snap->GetSequenceNumber() : kMaxSequenceNumber - 1;
|
|
|
|
|
|
|
|
// There're several MultiGet interfaces with not exactly the same
|
|
|
|
// implementations, query data with all of them.
|
|
|
|
auto keys = GetKeys();
|
|
|
|
std::vector<std::string> values;
|
|
|
|
auto s1 = db_->MultiGet(read_opts, cfs, keys, &values);
|
|
|
|
AssertVisibility(ts, seq, s1);
|
|
|
|
|
|
|
|
auto s2 = db_->MultiGet(read_opts, keys, &values);
|
|
|
|
AssertVisibility(ts, seq, s2);
|
|
|
|
|
|
|
|
std::vector<std::string> timestamps;
|
|
|
|
auto s3 = db_->MultiGet(read_opts, cfs, keys, &values, ×tamps);
|
|
|
|
AssertVisibility(ts, seq, s3);
|
|
|
|
|
|
|
|
auto s4 = db_->MultiGet(read_opts, keys, &values, ×tamps);
|
|
|
|
AssertVisibility(ts, seq, s4);
|
|
|
|
|
|
|
|
std::vector<PinnableSlice> values_ps5(kTestDataSize);
|
|
|
|
std::vector<Status> s5(kTestDataSize);
|
|
|
|
db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps5.data(),
|
|
|
|
s5.data());
|
|
|
|
AssertVisibility(ts, seq, s5);
|
|
|
|
|
|
|
|
std::vector<PinnableSlice> values_ps6(kTestDataSize);
|
|
|
|
std::vector<Status> s6(kTestDataSize);
|
|
|
|
std::vector<std::string> timestamps_array(kTestDataSize);
|
|
|
|
db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps6.data(),
|
|
|
|
timestamps_array.data(), s6.data());
|
|
|
|
AssertVisibility(ts, seq, s6);
|
|
|
|
|
|
|
|
std::vector<PinnableSlice> values_ps7(kTestDataSize);
|
|
|
|
std::vector<Status> s7(kTestDataSize);
|
|
|
|
db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
|
|
|
|
values_ps7.data(), s7.data());
|
|
|
|
AssertVisibility(ts, seq, s7);
|
|
|
|
|
|
|
|
std::vector<PinnableSlice> values_ps8(kTestDataSize);
|
|
|
|
std::vector<Status> s8(kTestDataSize);
|
|
|
|
db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
|
|
|
|
values_ps8.data(), timestamps_array.data(), s8.data());
|
|
|
|
AssertVisibility(ts, seq, s8);
|
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyDefaultCF(const Snapshot* snap = nullptr) {
|
|
|
|
for (int i = 0; i <= kTestDataSize; i++) {
|
|
|
|
VerifyDefaultCF(i, snap);
|
|
|
|
}
|
|
|
|
}
|
2020-08-15 02:19:36 +00:00
|
|
|
};
|
2020-09-25 16:40:51 +00:00
|
|
|
constexpr int DataVisibilityTest::kTestDataSize;
|
2020-08-15 02:19:36 +00:00
|
|
|
|
|
|
|
// Application specifies timestamp but not snapshot.
|
|
|
|
// reader writer
|
|
|
|
// ts'=90
|
|
|
|
// ts=100
|
|
|
|
// seq=10
|
|
|
|
// seq'=11
|
|
|
|
// write finishes
|
|
|
|
// GetImpl(ts,seq)
|
2022-09-30 23:13:03 +00:00
|
|
|
// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
|
2020-08-15 02:19:36 +00:00
|
|
|
// the key should not be returned.
|
|
|
|
TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot1) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
{"DBImpl::GetImpl:3",
|
|
|
|
"DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"},
|
|
|
|
{"DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut",
|
|
|
|
"DBImpl::GetImpl:4"},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
port::Thread writer_thread([this]() {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string write_ts = Timestamp(1, 0);
|
2020-08-15 02:19:36 +00:00
|
|
|
WriteOptions write_opts;
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut");
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, "foo", write_ts, "value");
|
2020-08-15 02:19:36 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut");
|
|
|
|
});
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string read_ts_str = Timestamp(3, 0);
|
|
|
|
Slice read_ts = read_ts_str;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::string value;
|
|
|
|
Status s = db_->Get(read_opts, "foo", &value);
|
|
|
|
|
|
|
|
writer_thread.join();
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Application specifies timestamp but not snapshot.
|
|
|
|
// reader writer
|
|
|
|
// ts'=90
|
|
|
|
// ts=100
|
|
|
|
// seq=10
|
|
|
|
// seq'=11
|
|
|
|
// write finishes
|
|
|
|
// Flush
|
|
|
|
// GetImpl(ts,seq)
|
|
|
|
// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
|
|
|
|
// the key should not be returned.
|
|
|
|
TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot2) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
{"DBImpl::GetImpl:3",
|
|
|
|
"DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"},
|
|
|
|
{"DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut",
|
|
|
|
"DBImpl::GetImpl:4"},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
port::Thread writer_thread([this]() {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string write_ts = Timestamp(1, 0);
|
2020-08-15 02:19:36 +00:00
|
|
|
WriteOptions write_opts;
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut");
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, "foo", write_ts, "value");
|
2020-08-15 02:19:36 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
write_ts = Timestamp(2, 0);
|
|
|
|
s = db_->Put(write_opts, "bar", write_ts, "value");
|
2020-08-15 02:19:36 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut");
|
|
|
|
});
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string read_ts_str = Timestamp(3, 0);
|
|
|
|
Slice read_ts = read_ts_str;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::string value;
|
|
|
|
Status s = db_->Get(read_opts, "foo", &value);
|
|
|
|
writer_thread.join();
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Application specifies both timestamp and snapshot.
|
|
|
|
// reader writer
|
|
|
|
// seq=10
|
|
|
|
// ts'=90
|
|
|
|
// ts=100
|
|
|
|
// seq'=11
|
|
|
|
// write finishes
|
|
|
|
// GetImpl(ts,seq)
|
|
|
|
// Since application specifies both timestamp and snapshot, application expects
|
|
|
|
// to see data that visible in BOTH timestamp and sequence number. Therefore,
|
|
|
|
// <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
|
|
|
|
TEST_F(DataVisibilityTest, PointLookupWithSnapshot1) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
{"DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap",
|
|
|
|
"DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"},
|
|
|
|
{"DataVisibilityTest::PointLookupWithSnapshot1:AfterPut",
|
|
|
|
"DBImpl::GetImpl:1"},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
port::Thread writer_thread([this]() {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string write_ts = Timestamp(1, 0);
|
2020-08-15 02:19:36 +00:00
|
|
|
WriteOptions write_opts;
|
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:BeforePut");
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, "foo", write_ts, "value");
|
2020-08-15 02:19:36 +00:00
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:AfterPut");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
});
|
|
|
|
ReadOptions read_opts;
|
|
|
|
const Snapshot* snap = db_->GetSnapshot();
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap");
|
|
|
|
read_opts.snapshot = snap;
|
|
|
|
std::string read_ts_str = Timestamp(3, 0);
|
|
|
|
Slice read_ts = read_ts_str;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::string value;
|
|
|
|
Status s = db_->Get(read_opts, "foo", &value);
|
|
|
|
writer_thread.join();
|
|
|
|
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
db_->ReleaseSnapshot(snap);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Application specifies both timestamp and snapshot.
|
|
|
|
// reader writer
|
|
|
|
// seq=10
|
|
|
|
// ts'=90
|
|
|
|
// ts=100
|
|
|
|
// seq'=11
|
|
|
|
// write finishes
|
|
|
|
// Flush
|
|
|
|
// GetImpl(ts,seq)
|
|
|
|
// Since application specifies both timestamp and snapshot, application expects
|
|
|
|
// to see data that visible in BOTH timestamp and sequence number. Therefore,
|
|
|
|
// <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
|
|
|
|
TEST_F(DataVisibilityTest, PointLookupWithSnapshot2) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
{"DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap",
|
|
|
|
"DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
port::Thread writer_thread([this]() {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string write_ts = Timestamp(1, 0);
|
2020-08-15 02:19:36 +00:00
|
|
|
WriteOptions write_opts;
|
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot2:BeforePut");
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, "foo", write_ts, "value1");
|
2020-08-15 02:19:36 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
write_ts = Timestamp(2, 0);
|
|
|
|
s = db_->Put(write_opts, "bar", write_ts, "value2");
|
2020-08-15 02:19:36 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
});
|
|
|
|
const Snapshot* snap = db_->GetSnapshot();
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap");
|
|
|
|
writer_thread.join();
|
|
|
|
std::string read_ts_str = Timestamp(3, 0);
|
|
|
|
Slice read_ts = read_ts_str;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.snapshot = snap;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::string value;
|
|
|
|
Status s = db_->Get(read_opts, "foo", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
db_->ReleaseSnapshot(snap);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Application specifies timestamp but not snapshot.
|
|
|
|
// reader writer
|
|
|
|
// ts'=90
|
|
|
|
// ts=100
|
|
|
|
// seq=10
|
|
|
|
// seq'=11
|
|
|
|
// write finishes
|
|
|
|
// scan(ts,seq)
|
|
|
|
// <k, t1, s1> can be seen in scan as long as ts>=t1 AND seq>=s1. If ts>=t1 but
|
|
|
|
// seq<s1, then the key should not be returned.
|
|
|
|
TEST_F(DataVisibilityTest, RangeScanWithoutSnapshot) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
{"DBImpl::NewIterator:3",
|
|
|
|
"DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
port::Thread writer_thread([this]() {
|
|
|
|
WriteOptions write_opts;
|
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut");
|
|
|
|
for (int i = 0; i < 3; ++i) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string write_ts = Timestamp(i + 1, 0);
|
|
|
|
Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
|
2020-08-15 02:19:36 +00:00
|
|
|
"value" + std::to_string(i));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
std::string read_ts_str = Timestamp(10, 0);
|
|
|
|
Slice read_ts = read_ts_str;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
Iterator* it = db_->NewIterator(read_opts);
|
|
|
|
ASSERT_NE(nullptr, it);
|
|
|
|
writer_thread.join();
|
|
|
|
it->SeekToFirst();
|
|
|
|
ASSERT_FALSE(it->Valid());
|
|
|
|
delete it;
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Application specifies both timestamp and snapshot.
|
|
|
|
// reader writer
|
|
|
|
// seq=10
|
|
|
|
// ts'=90
|
|
|
|
// ts=100 seq'=11
|
|
|
|
// write finishes
|
|
|
|
// scan(ts,seq)
|
|
|
|
// <k, t1, s1> can be seen by the scan only if t1<=ts AND s1<=seq. If t1<=ts
|
|
|
|
// but s1>seq, then the key should not be returned.
|
|
|
|
TEST_F(DataVisibilityTest, RangeScanWithSnapshot) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
|
|
|
{"DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot",
|
|
|
|
"DataVisibilityTest::RangeScanWithSnapshot:BeforePut"},
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
port::Thread writer_thread([this]() {
|
|
|
|
WriteOptions write_opts;
|
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithSnapshot:BeforePut");
|
|
|
|
for (int i = 0; i < 3; ++i) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string write_ts = Timestamp(i + 1, 0);
|
|
|
|
Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
|
2020-08-15 02:19:36 +00:00
|
|
|
"value" + std::to_string(i));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
const Snapshot* snap = db_->GetSnapshot();
|
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot");
|
|
|
|
|
|
|
|
writer_thread.join();
|
|
|
|
|
|
|
|
std::string read_ts_str = Timestamp(10, 0);
|
|
|
|
Slice read_ts = read_ts_str;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.snapshot = snap;
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
Iterator* it = db_->NewIterator(read_opts);
|
|
|
|
ASSERT_NE(nullptr, it);
|
|
|
|
it->Seek("key0");
|
|
|
|
ASSERT_FALSE(it->Valid());
|
|
|
|
|
|
|
|
delete it;
|
|
|
|
db_->ReleaseSnapshot(snap);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-09-25 16:40:51 +00:00
|
|
|
// Application specifies both timestamp and snapshot.
|
|
|
|
// Query each combination and make sure for MultiGet key <k, t1, s1>, only
|
|
|
|
// return keys that ts>=t1 AND seq>=s1.
|
|
|
|
TEST_F(DataVisibilityTest, MultiGetWithTimestamp) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const Snapshot* snap0 = db_->GetSnapshot();
|
|
|
|
PutTestData(0);
|
|
|
|
VerifyDefaultCF();
|
|
|
|
VerifyDefaultCF(snap0);
|
|
|
|
|
|
|
|
const Snapshot* snap1 = db_->GetSnapshot();
|
|
|
|
PutTestData(1);
|
|
|
|
VerifyDefaultCF();
|
|
|
|
VerifyDefaultCF(snap0);
|
|
|
|
VerifyDefaultCF(snap1);
|
|
|
|
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-09-25 16:40:51 +00:00
|
|
|
|
|
|
|
const Snapshot* snap2 = db_->GetSnapshot();
|
|
|
|
PutTestData(2);
|
|
|
|
VerifyDefaultCF();
|
|
|
|
VerifyDefaultCF(snap0);
|
|
|
|
VerifyDefaultCF(snap1);
|
|
|
|
VerifyDefaultCF(snap2);
|
|
|
|
|
|
|
|
db_->ReleaseSnapshot(snap0);
|
|
|
|
db_->ReleaseSnapshot(snap1);
|
|
|
|
db_->ReleaseSnapshot(snap2);
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Application specifies timestamp but not snapshot.
|
|
|
|
// reader writer
|
|
|
|
// ts'=0, 1
|
|
|
|
// ts=3
|
|
|
|
// seq=10
|
|
|
|
// seq'=11, 12
|
|
|
|
// write finishes
|
|
|
|
// MultiGet(ts,seq)
|
|
|
|
// For MultiGet <k, t1, s1>, only return keys that ts>=t1 AND seq>=s1.
|
|
|
|
TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->LoadDependency({
|
2024-02-16 17:21:06 +00:00
|
|
|
{"DBImpl::MultiCFSnapshot:AfterGetSeqNum1",
|
2020-09-25 16:40:51 +00:00
|
|
|
"DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"},
|
|
|
|
{"DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut",
|
2024-02-16 17:21:06 +00:00
|
|
|
"DBImpl::MultiCFSnapshot:AfterGetSeqNum2"},
|
2020-09-25 16:40:51 +00:00
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
port::Thread writer_thread([this]() {
|
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut");
|
|
|
|
PutTestData(0);
|
|
|
|
PutTestData(1);
|
|
|
|
TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut");
|
|
|
|
});
|
|
|
|
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string read_ts = Timestamp(kTestDataSize, 0);
|
|
|
|
Slice read_ts_slice = read_ts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
auto keys = GetKeys();
|
|
|
|
std::vector<std::string> values;
|
|
|
|
auto ss = db_->MultiGet(read_opts, keys, &values);
|
|
|
|
|
|
|
|
writer_thread.join();
|
2024-01-05 19:53:57 +00:00
|
|
|
for (const auto& s : ss) {
|
2020-09-25 16:40:51 +00:00
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
}
|
|
|
|
VerifyDefaultCF();
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DataVisibilityTest, MultiGetCrossCF) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
CreateAndReopenWithCF({"second"}, options);
|
|
|
|
ColumnFamilyHandle* second_cf = handles_[1];
|
|
|
|
|
|
|
|
const Snapshot* snap0 = db_->GetSnapshot();
|
|
|
|
PutTestData(0);
|
|
|
|
PutTestData(0, second_cf);
|
|
|
|
VerifyDefaultCF();
|
|
|
|
VerifyDefaultCF(snap0);
|
|
|
|
|
|
|
|
const Snapshot* snap1 = db_->GetSnapshot();
|
|
|
|
PutTestData(1);
|
|
|
|
PutTestData(1, second_cf);
|
|
|
|
VerifyDefaultCF();
|
|
|
|
VerifyDefaultCF(snap0);
|
|
|
|
VerifyDefaultCF(snap1);
|
|
|
|
|
2020-12-01 22:05:19 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-09-25 16:40:51 +00:00
|
|
|
|
|
|
|
const Snapshot* snap2 = db_->GetSnapshot();
|
|
|
|
PutTestData(2);
|
|
|
|
PutTestData(2, second_cf);
|
|
|
|
VerifyDefaultCF();
|
|
|
|
VerifyDefaultCF(snap0);
|
|
|
|
VerifyDefaultCF(snap1);
|
|
|
|
VerifyDefaultCF(snap2);
|
|
|
|
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string read_ts = Timestamp(kTestDataSize, 0);
|
|
|
|
Slice read_ts_slice = read_ts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
read_opts.snapshot = snap1;
|
|
|
|
auto keys = GetKeys();
|
|
|
|
auto keys2 = GetKeys();
|
|
|
|
keys.insert(keys.end(), keys2.begin(), keys2.end());
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(kTestDataSize,
|
|
|
|
db_->DefaultColumnFamily());
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs2(kTestDataSize, second_cf);
|
|
|
|
cfs.insert(cfs.end(), cfs2.begin(), cfs2.end());
|
|
|
|
|
|
|
|
std::vector<std::string> values;
|
|
|
|
auto ss = db_->MultiGet(read_opts, cfs, keys, &values);
|
|
|
|
for (int i = 0; i < 2 * kTestDataSize; i++) {
|
|
|
|
if (i % 3 == 0) {
|
|
|
|
// only the first key for each column family should be returned
|
|
|
|
ASSERT_OK(ss[i]);
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(ss[i].IsNotFound());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
db_->ReleaseSnapshot(snap0);
|
|
|
|
db_->ReleaseSnapshot(snap1);
|
|
|
|
db_->ReleaseSnapshot(snap2);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2021-07-07 18:13:09 +00:00
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
2020-03-13 18:33:04 +00:00
|
|
|
class DBBasicTestWithTimestampCompressionSettings
|
|
|
|
: public DBBasicTestWithTimestampBase,
|
2020-04-01 23:37:54 +00:00
|
|
|
public testing::WithParamInterface<
|
|
|
|
std::tuple<std::shared_ptr<const FilterPolicy>, CompressionType,
|
|
|
|
uint32_t, uint32_t>> {
|
2020-03-13 18:33:04 +00:00
|
|
|
public:
|
|
|
|
DBBasicTestWithTimestampCompressionSettings()
|
|
|
|
: DBBasicTestWithTimestampBase(
|
|
|
|
"db_basic_test_with_timestamp_compression") {}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGet) {
|
2020-03-17 17:47:36 +00:00
|
|
|
const int kNumKeysPerFile = 1024;
|
|
|
|
const size_t kNumTimestamps = 4;
|
2020-03-13 18:33:04 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.env = env_;
|
2021-09-08 14:45:59 +00:00
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
2020-03-13 18:33:04 +00:00
|
|
|
size_t ts_sz = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(ts_sz);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy = std::get<0>(GetParam());
|
|
|
|
bbto.whole_key_filtering = true;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
|
|
|
|
const CompressionType comp_type = std::get<1>(GetParam());
|
|
|
|
#if LZ4_VERSION_NUMBER < 10400 // r124+
|
|
|
|
if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // LZ4_VERSION_NUMBER >= 10400
|
|
|
|
if (!ZSTD_Supported() && comp_type == kZSTD) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (!Zlib_Supported() && comp_type == kZlibCompression) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
options.compression = comp_type;
|
|
|
|
options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
|
|
|
|
if (comp_type == kZSTD) {
|
|
|
|
options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
|
|
|
|
}
|
2020-04-01 23:37:54 +00:00
|
|
|
options.compression_opts.parallel_threads = std::get<3>(GetParam());
|
2020-03-13 18:33:04 +00:00
|
|
|
options.target_file_size_base = 1 << 26; // 64MB
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
size_t num_cfs = handles_.size();
|
|
|
|
ASSERT_EQ(2, num_cfs);
|
|
|
|
std::vector<std::string> write_ts_list;
|
|
|
|
std::vector<std::string> read_ts_list;
|
|
|
|
|
|
|
|
for (size_t i = 0; i != kNumTimestamps; ++i) {
|
|
|
|
write_ts_list.push_back(Timestamp(i * 2, 0));
|
|
|
|
read_ts_list.push_back(Timestamp(1 + i * 2, 0));
|
|
|
|
const Slice write_ts = write_ts_list.back();
|
|
|
|
WriteOptions wopts;
|
|
|
|
for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
|
|
|
|
for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(
|
|
|
|
db_->Put(wopts, handles_[cf], Key1(j), write_ts,
|
|
|
|
"value_" + std::to_string(j) + "_" + std::to_string(i)));
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
const auto& verify_db_func = [&]() {
|
|
|
|
for (size_t i = 0; i != kNumTimestamps; ++i) {
|
|
|
|
ReadOptions ropts;
|
|
|
|
const Slice read_ts = read_ts_list[i];
|
|
|
|
ropts.timestamp = &read_ts;
|
|
|
|
for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
|
|
|
|
ColumnFamilyHandle* cfh = handles_[cf];
|
|
|
|
for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
|
|
|
|
std::string value;
|
|
|
|
ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value));
|
|
|
|
ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
|
|
|
|
value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
verify_db_func();
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-05-28 17:37:57 +00:00
|
|
|
TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
const int kNumKeysPerFile = 1024;
|
2021-09-08 14:45:59 +00:00
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
2020-05-28 17:37:57 +00:00
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy = std::get<0>(GetParam());
|
|
|
|
bbto.whole_key_filtering = true;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
|
|
|
|
const CompressionType comp_type = std::get<1>(GetParam());
|
|
|
|
#if LZ4_VERSION_NUMBER < 10400 // r124+
|
|
|
|
if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // LZ4_VERSION_NUMBER >= 10400
|
|
|
|
if (!ZSTD_Supported() && comp_type == kZSTD) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (!Zlib_Supported() && comp_type == kZlibCompression) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
options.compression = comp_type;
|
|
|
|
options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
|
|
|
|
if (comp_type == kZSTD) {
|
|
|
|
options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
|
|
|
|
}
|
|
|
|
options.compression_opts.parallel_threads = std::get<3>(GetParam());
|
|
|
|
options.target_file_size_base = 1 << 26; // 64MB
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const size_t kNumL0Files =
|
|
|
|
static_cast<size_t>(Options().level0_file_num_compaction_trigger);
|
|
|
|
{
|
2021-09-27 18:49:35 +00:00
|
|
|
// Half of the keys will go through Deletion and remaining half with
|
|
|
|
// SingleDeletion. Generate enough L0 files with ts=1 to trigger compaction
|
|
|
|
// to L1
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
2020-05-28 17:37:57 +00:00
|
|
|
WriteOptions wopts;
|
2021-09-27 18:49:35 +00:00
|
|
|
for (size_t i = 0; i < kNumL0Files; ++i) {
|
|
|
|
for (int j = 0; j < kNumKeysPerFile; ++j) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(wopts, Key1(j), ts, "value" + std::to_string(i)));
|
2020-05-28 17:37:57 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Flush(FlushOptions()));
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
// Generate another L0 at ts=3
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(3, 0);
|
2021-09-27 18:49:35 +00:00
|
|
|
for (int i = 0; i < kNumKeysPerFile; ++i) {
|
2020-05-28 17:37:57 +00:00
|
|
|
std::string key_str = Key1(i);
|
|
|
|
Slice key(key_str);
|
|
|
|
if ((i % 3) == 0) {
|
2021-09-27 18:49:35 +00:00
|
|
|
if (i < kNumKeysPerFile / 2) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Delete(wopts, key, ts));
|
2021-09-27 18:49:35 +00:00
|
|
|
} else {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->SingleDelete(wopts, key, ts));
|
2021-09-27 18:49:35 +00:00
|
|
|
}
|
2020-05-28 17:37:57 +00:00
|
|
|
} else {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(wopts, key, ts, "new_value"));
|
2020-05-28 17:37:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Flush(FlushOptions()));
|
|
|
|
// Populate memtable at ts=5
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(5, 0);
|
2020-05-28 17:37:57 +00:00
|
|
|
for (int i = 0; i != kNumKeysPerFile; ++i) {
|
|
|
|
std::string key_str = Key1(i);
|
|
|
|
Slice key(key_str);
|
|
|
|
if ((i % 3) == 1) {
|
2021-09-27 18:49:35 +00:00
|
|
|
if (i < kNumKeysPerFile / 2) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Delete(wopts, key, ts));
|
2021-09-27 18:49:35 +00:00
|
|
|
} else {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->SingleDelete(wopts, key, ts));
|
2021-09-27 18:49:35 +00:00
|
|
|
}
|
2020-05-28 17:37:57 +00:00
|
|
|
} else if ((i % 3) == 2) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(db_->Put(wopts, key, ts, "new_value_2"));
|
2020-05-28 17:37:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
{
|
|
|
|
std::string ts_str = Timestamp(6, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
ReadOptions ropts;
|
|
|
|
ropts.timestamp = &ts;
|
|
|
|
for (uint64_t i = 0; i != static_cast<uint64_t>(kNumKeysPerFile); ++i) {
|
|
|
|
std::string value;
|
2022-06-04 03:00:42 +00:00
|
|
|
std::string key_ts;
|
|
|
|
Status s = db_->Get(ropts, Key1(i), &value, &key_ts);
|
2020-05-28 17:37:57 +00:00
|
|
|
if ((i % 3) == 2) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ("new_value_2", value);
|
2022-06-04 03:00:42 +00:00
|
|
|
ASSERT_EQ(Timestamp(5, 0), key_ts);
|
|
|
|
} else if ((i % 3) == 1) {
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
ASSERT_EQ(Timestamp(5, 0), key_ts);
|
2020-05-28 17:37:57 +00:00
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2022-06-04 03:00:42 +00:00
|
|
|
ASSERT_EQ(Timestamp(3, 0), key_ts);
|
2020-05-28 17:37:57 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-13 18:33:04 +00:00
|
|
|
// A class which remembers the name of each flushed file.
|
|
|
|
class FlushedFileCollector : public EventListener {
|
|
|
|
public:
|
2024-01-05 19:53:57 +00:00
|
|
|
FlushedFileCollector() = default;
|
|
|
|
~FlushedFileCollector() override = default;
|
2020-03-13 18:33:04 +00:00
|
|
|
|
|
|
|
void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
|
|
|
|
InstrumentedMutexLock lock(&mutex_);
|
|
|
|
flushed_files_.push_back(info.file_path);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> GetFlushedFiles() {
|
|
|
|
std::vector<std::string> result;
|
|
|
|
{
|
|
|
|
InstrumentedMutexLock lock(&mutex_);
|
|
|
|
result = flushed_files_;
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ClearFlushedFiles() {
|
|
|
|
InstrumentedMutexLock lock(&mutex_);
|
|
|
|
flushed_files_.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::vector<std::string> flushed_files_;
|
|
|
|
InstrumentedMutex mutex_;
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGetWithCompaction) {
|
2020-03-17 17:47:36 +00:00
|
|
|
const int kNumKeysPerFile = 1024;
|
2020-03-13 18:33:04 +00:00
|
|
|
const size_t kNumTimestamps = 2;
|
|
|
|
const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
|
|
|
|
const size_t kSplitPosBase = kNumKeysPerTimestamp / 2;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.env = env_;
|
2021-09-08 14:45:59 +00:00
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
2020-03-13 18:33:04 +00:00
|
|
|
|
|
|
|
FlushedFileCollector* collector = new FlushedFileCollector();
|
|
|
|
options.listeners.emplace_back(collector);
|
|
|
|
|
|
|
|
size_t ts_sz = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(ts_sz);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy = std::get<0>(GetParam());
|
|
|
|
bbto.whole_key_filtering = true;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
|
|
|
|
const CompressionType comp_type = std::get<1>(GetParam());
|
|
|
|
#if LZ4_VERSION_NUMBER < 10400 // r124+
|
|
|
|
if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // LZ4_VERSION_NUMBER >= 10400
|
|
|
|
if (!ZSTD_Supported() && comp_type == kZSTD) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (!Zlib_Supported() && comp_type == kZlibCompression) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
options.compression = comp_type;
|
|
|
|
options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
|
|
|
|
if (comp_type == kZSTD) {
|
|
|
|
options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
|
|
|
|
}
|
2020-04-01 23:37:54 +00:00
|
|
|
options.compression_opts.parallel_threads = std::get<3>(GetParam());
|
2020-03-13 18:33:04 +00:00
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
size_t num_cfs = handles_.size();
|
|
|
|
ASSERT_EQ(2, num_cfs);
|
|
|
|
std::vector<std::string> write_ts_list;
|
|
|
|
std::vector<std::string> read_ts_list;
|
|
|
|
|
2020-03-24 18:21:10 +00:00
|
|
|
const auto& verify_records_func = [&](size_t i, size_t begin, size_t end,
|
2020-04-10 16:49:38 +00:00
|
|
|
ColumnFamilyHandle* cfh) {
|
2020-03-13 18:33:04 +00:00
|
|
|
std::string value;
|
|
|
|
std::string timestamp;
|
|
|
|
|
|
|
|
ReadOptions ropts;
|
|
|
|
const Slice read_ts = read_ts_list[i];
|
|
|
|
ropts.timestamp = &read_ts;
|
|
|
|
std::string expected_timestamp =
|
|
|
|
std::string(write_ts_list[i].data(), write_ts_list[i].size());
|
|
|
|
|
2020-03-24 18:21:10 +00:00
|
|
|
for (size_t j = begin; j <= end; ++j) {
|
|
|
|
ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value, ×tamp));
|
|
|
|
ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), value);
|
|
|
|
ASSERT_EQ(expected_timestamp, timestamp);
|
|
|
|
}
|
2020-03-13 18:33:04 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
for (size_t i = 0; i != kNumTimestamps; ++i) {
|
|
|
|
write_ts_list.push_back(Timestamp(i * 2, 0));
|
|
|
|
read_ts_list.push_back(Timestamp(1 + i * 2, 0));
|
|
|
|
const Slice write_ts = write_ts_list.back();
|
|
|
|
WriteOptions wopts;
|
|
|
|
for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
|
|
|
|
size_t memtable_get_start = 0;
|
|
|
|
for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(
|
|
|
|
db_->Put(wopts, handles_[cf], Key1(j), write_ts,
|
|
|
|
"value_" + std::to_string(j) + "_" + std::to_string(i)));
|
2020-03-13 18:33:04 +00:00
|
|
|
if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) {
|
2020-03-24 18:21:10 +00:00
|
|
|
verify_records_func(i, memtable_get_start, j, handles_[cf]);
|
2020-03-13 18:33:04 +00:00
|
|
|
memtable_get_start = j + 1;
|
|
|
|
|
|
|
|
// flush all keys with the same timestamp to two sst files, split at
|
|
|
|
// incremental positions such that lowerlevel[1].smallest.userkey ==
|
|
|
|
// higherlevel[0].largest.userkey
|
|
|
|
ASSERT_OK(Flush(cf));
|
2021-11-06 00:56:33 +00:00
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact()); // wait for flush (which
|
|
|
|
// is also a compaction)
|
2020-03-13 18:33:04 +00:00
|
|
|
|
2020-04-10 16:49:38 +00:00
|
|
|
// compact files (2 at each level) to a lower level such that all
|
|
|
|
// keys with the same timestamp is at one level, with newer versions
|
|
|
|
// at higher levels.
|
2020-03-13 18:33:04 +00:00
|
|
|
CompactionOptions compact_opt;
|
|
|
|
compact_opt.compression = kNoCompression;
|
2020-09-30 01:21:49 +00:00
|
|
|
ASSERT_OK(db_->CompactFiles(compact_opt, handles_[cf],
|
|
|
|
collector->GetFlushedFiles(),
|
|
|
|
static_cast<int>(kNumTimestamps - i)));
|
2020-03-13 18:33:04 +00:00
|
|
|
collector->ClearFlushedFiles();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
const auto& verify_db_func = [&]() {
|
|
|
|
for (size_t i = 0; i != kNumTimestamps; ++i) {
|
|
|
|
ReadOptions ropts;
|
|
|
|
const Slice read_ts = read_ts_list[i];
|
|
|
|
ropts.timestamp = &read_ts;
|
|
|
|
std::string expected_timestamp(write_ts_list[i].data(),
|
|
|
|
write_ts_list[i].size());
|
|
|
|
for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
|
|
|
|
ColumnFamilyHandle* cfh = handles_[cf];
|
2020-03-24 18:21:10 +00:00
|
|
|
verify_records_func(i, 0, kNumKeysPerTimestamp - 1, cfh);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
verify_db_func();
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) {
|
|
|
|
const int kNumKeysPerFile = 8192;
|
|
|
|
const size_t kNumTimestamps = 2;
|
|
|
|
const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.env = env_;
|
2021-09-08 14:45:59 +00:00
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
2020-12-01 22:05:19 +00:00
|
|
|
options.memtable_prefix_bloom_size_ratio = 0.1;
|
|
|
|
options.memtable_whole_key_filtering = true;
|
2020-03-24 18:21:10 +00:00
|
|
|
|
|
|
|
size_t ts_sz = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(ts_sz);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(
|
|
|
|
10 /*bits_per_key*/, false /*use_block_based_builder*/));
|
|
|
|
bbto.whole_key_filtering = true;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
size_t num_cfs = handles_.size();
|
|
|
|
ASSERT_EQ(2, num_cfs);
|
|
|
|
std::vector<std::string> write_ts_list;
|
|
|
|
std::vector<std::string> read_ts_list;
|
|
|
|
|
|
|
|
const auto& verify_records_func = [&](size_t i, ColumnFamilyHandle* cfh) {
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<std::string> key_vals;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
std::vector<std::string> timestamps;
|
|
|
|
|
|
|
|
for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
|
|
|
|
key_vals.push_back(Key1(j));
|
|
|
|
}
|
|
|
|
for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
|
2024-01-05 19:53:57 +00:00
|
|
|
keys.emplace_back(key_vals[j]);
|
2020-03-24 18:21:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ReadOptions ropts;
|
|
|
|
const Slice read_ts = read_ts_list[i];
|
|
|
|
ropts.timestamp = &read_ts;
|
|
|
|
std::string expected_timestamp(write_ts_list[i].data(),
|
|
|
|
write_ts_list[i].size());
|
|
|
|
|
|
|
|
std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
|
|
|
|
std::vector<Status> statuses =
|
|
|
|
db_->MultiGet(ropts, cfhs, keys, &values, ×tamps);
|
|
|
|
for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
|
|
|
|
ASSERT_OK(statuses[j]);
|
|
|
|
ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
|
|
|
|
values[j]);
|
|
|
|
ASSERT_EQ(expected_timestamp, timestamps[j]);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2021-09-12 22:33:15 +00:00
|
|
|
const std::string dummy_ts(ts_sz, '\0');
|
2020-03-24 18:21:10 +00:00
|
|
|
for (size_t i = 0; i != kNumTimestamps; ++i) {
|
|
|
|
write_ts_list.push_back(Timestamp(i * 2, 0));
|
|
|
|
read_ts_list.push_back(Timestamp(1 + i * 2, 0));
|
|
|
|
const Slice& write_ts = write_ts_list.back();
|
|
|
|
for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
|
|
|
|
WriteOptions wopts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
WriteBatch batch(0, 0, 0, ts_sz);
|
2020-03-24 18:21:10 +00:00
|
|
|
for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
|
2021-09-12 22:33:15 +00:00
|
|
|
const std::string key = Key1(j);
|
|
|
|
const std::string value =
|
|
|
|
"value_" + std::to_string(j) + "_" + std::to_string(i);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(batch.Put(handles_[cf], key, value));
|
2020-03-24 18:21:10 +00:00
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ASSERT_OK(batch.UpdateTimestamps(write_ts,
|
|
|
|
[ts_sz](uint32_t) { return ts_sz; }));
|
2020-03-24 18:21:10 +00:00
|
|
|
ASSERT_OK(db_->Write(wopts, &batch));
|
|
|
|
|
|
|
|
verify_records_func(i, handles_[cf]);
|
|
|
|
|
|
|
|
ASSERT_OK(Flush(cf));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const auto& verify_db_func = [&]() {
|
|
|
|
for (size_t i = 0; i != kNumTimestamps; ++i) {
|
|
|
|
ReadOptions ropts;
|
|
|
|
const Slice read_ts = read_ts_list[i];
|
|
|
|
ropts.timestamp = &read_ts;
|
|
|
|
for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
|
|
|
|
ColumnFamilyHandle* cfh = handles_[cf];
|
|
|
|
verify_records_func(i, cfh);
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
verify_db_func();
|
|
|
|
Close();
|
|
|
|
}
|
2020-03-24 18:21:10 +00:00
|
|
|
|
2020-04-28 05:47:44 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, MultiGetNoReturnTs) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
WriteOptions write_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::string ts = Timestamp(1, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "foo", ts, "value"));
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "bar", ts, "value"));
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "fooxxxxxxxxxxxxxxxx", ts, "value"));
|
|
|
|
ASSERT_OK(db_->Put(write_opts, "barxxxxxxxxxxxxxxxx", ts, "value"));
|
2020-04-28 05:47:44 +00:00
|
|
|
ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
ts = Timestamp(2, 0);
|
|
|
|
Slice read_ts = ts;
|
2020-04-28 05:47:44 +00:00
|
|
|
ReadOptions read_opts;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
read_opts.timestamp = &read_ts;
|
2020-04-28 05:47:44 +00:00
|
|
|
{
|
|
|
|
ColumnFamilyHandle* column_families[] = {cfh, cfh};
|
|
|
|
Slice keys[] = {"foo", "bar"};
|
|
|
|
PinnableSlice values[] = {PinnableSlice(), PinnableSlice()};
|
|
|
|
Status statuses[] = {Status::OK(), Status::OK()};
|
|
|
|
dbfull()->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0],
|
|
|
|
&values[0], &statuses[0], /*sorted_input=*/false);
|
|
|
|
for (const auto& s : statuses) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
{
|
|
|
|
ColumnFamilyHandle* column_families[] = {cfh, cfh, cfh, cfh};
|
|
|
|
// Make user keys longer than configured timestamp size (16 bytes) to
|
|
|
|
// verify RocksDB does not use the trailing bytes 'x' as timestamp.
|
|
|
|
Slice keys[] = {"fooxxxxxxxxxxxxxxxx", "barxxxxxxxxxxxxxxxx", "foo", "bar"};
|
|
|
|
PinnableSlice values[] = {PinnableSlice(), PinnableSlice(), PinnableSlice(),
|
|
|
|
PinnableSlice()};
|
|
|
|
Status statuses[] = {Status::OK(), Status::OK(), Status::OK(),
|
|
|
|
Status::OK()};
|
|
|
|
dbfull()->MultiGet(read_opts, /*num_keys=*/4, &column_families[0], &keys[0],
|
|
|
|
&values[0], &statuses[0], /*sorted_input=*/false);
|
|
|
|
for (const auto& s : statuses) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-03-13 18:33:04 +00:00
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
|
2020-03-13 18:33:04 +00:00
|
|
|
Timestamp, DBBasicTestWithTimestampCompressionSettings,
|
|
|
|
::testing::Combine(
|
|
|
|
::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
|
|
|
|
std::shared_ptr<const FilterPolicy>(
|
|
|
|
NewBloomFilterPolicy(10, false))),
|
|
|
|
::testing::Values(kNoCompression, kZlibCompression, kLZ4Compression,
|
|
|
|
kLZ4HCCompression, kZSTD),
|
2020-04-01 23:37:54 +00:00
|
|
|
::testing::Values(0, 1 << 14), ::testing::Values(1, 4)));
|
2020-03-13 18:33:04 +00:00
|
|
|
|
|
|
|
class DBBasicTestWithTimestampPrefixSeek
|
|
|
|
: public DBBasicTestWithTimestampBase,
|
|
|
|
public testing::WithParamInterface<
|
|
|
|
std::tuple<std::shared_ptr<const SliceTransform>,
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
std::shared_ptr<const FilterPolicy>, bool,
|
|
|
|
BlockBasedTableOptions::IndexType>> {
|
2020-03-13 18:33:04 +00:00
|
|
|
public:
|
|
|
|
DBBasicTestWithTimestampPrefixSeek()
|
|
|
|
: DBBasicTestWithTimestampBase(
|
|
|
|
"/db_basic_test_with_timestamp_prefix_seek") {}
|
|
|
|
};
|
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) {
|
2020-03-17 17:47:36 +00:00
|
|
|
const size_t kNumKeysPerFile = 128;
|
2020-03-13 18:33:04 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.prefix_extractor = std::get<0>(GetParam());
|
2021-09-08 14:45:59 +00:00
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
2020-03-13 18:33:04 +00:00
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy = std::get<1>(GetParam());
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = std::get<3>(GetParam());
|
2020-03-13 18:33:04 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const uint64_t kMaxKey = 0xffffffffffffffff;
|
2020-03-17 17:47:36 +00:00
|
|
|
const uint64_t kMinKey = 0xfffffffffffff000;
|
2020-03-13 18:33:04 +00:00
|
|
|
const std::vector<std::string> write_ts_list = {Timestamp(3, 0xffffffff),
|
|
|
|
Timestamp(6, 0xffffffff)};
|
|
|
|
WriteOptions write_opts;
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i != write_ts_list.size(); ++i) {
|
2020-05-28 17:37:57 +00:00
|
|
|
for (uint64_t key = kMaxKey; key >= kMinKey; --key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, Key1(key), write_ts_list[i],
|
|
|
|
"value" + std::to_string(i));
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_OK(s);
|
2020-05-28 17:37:57 +00:00
|
|
|
}
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
const std::vector<std::string> read_ts_list = {Timestamp(5, 0xffffffff),
|
|
|
|
Timestamp(9, 0xffffffff)};
|
|
|
|
{
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.total_order_seek = false;
|
|
|
|
read_opts.prefix_same_as_start = std::get<2>(GetParam());
|
|
|
|
fprintf(stdout, "%s %s %d\n", options.prefix_extractor->Name(),
|
|
|
|
bbto.filter_policy ? bbto.filter_policy->Name() : "null",
|
|
|
|
static_cast<int>(read_opts.prefix_same_as_start));
|
|
|
|
for (size_t i = 0; i != read_ts_list.size(); ++i) {
|
|
|
|
Slice read_ts = read_ts_list[i];
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
|
|
|
|
// Seek to kMaxKey
|
|
|
|
iter->Seek(Key1(kMaxKey));
|
2020-08-05 01:50:05 +00:00
|
|
|
CheckIterUserEntry(iter.get(), Key1(kMaxKey), kTypeValue,
|
|
|
|
"value" + std::to_string(i), write_ts_list[i]);
|
2020-03-13 18:33:04 +00:00
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2021-03-10 19:13:55 +00:00
|
|
|
|
|
|
|
// Seek to kMinKey
|
|
|
|
iter->Seek(Key1(kMinKey));
|
|
|
|
CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue,
|
|
|
|
"value" + std::to_string(i), write_ts_list[i]);
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
|
|
|
const std::vector<uint64_t> targets = {kMinKey, kMinKey + 0x10,
|
|
|
|
kMinKey + 0x100, kMaxKey};
|
|
|
|
const SliceTransform* const pe = options.prefix_extractor.get();
|
|
|
|
ASSERT_NE(nullptr, pe);
|
|
|
|
const size_t kPrefixShift =
|
|
|
|
8 * (Key1(0).size() - pe->Transform(Key1(0)).size());
|
|
|
|
const uint64_t kPrefixMask =
|
|
|
|
~((static_cast<uint64_t>(1) << kPrefixShift) - 1);
|
|
|
|
const uint64_t kNumKeysWithinPrefix =
|
|
|
|
(static_cast<uint64_t>(1) << kPrefixShift);
|
|
|
|
for (size_t i = 0; i != read_ts_list.size(); ++i) {
|
|
|
|
Slice read_ts = read_ts_list[i];
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
2021-03-10 19:13:55 +00:00
|
|
|
// Forward and backward iterate.
|
2020-03-13 18:33:04 +00:00
|
|
|
for (size_t j = 0; j != targets.size(); ++j) {
|
|
|
|
std::string start_key = Key1(targets[j]);
|
|
|
|
uint64_t expected_ub =
|
|
|
|
(targets[j] & kPrefixMask) - 1 + kNumKeysWithinPrefix;
|
|
|
|
uint64_t expected_key = targets[j];
|
|
|
|
size_t count = 0;
|
|
|
|
it->Seek(Key1(targets[j]));
|
|
|
|
while (it->Valid()) {
|
|
|
|
std::string saved_prev_key;
|
|
|
|
saved_prev_key.assign(it->key().data(), it->key().size());
|
|
|
|
|
|
|
|
// Out of prefix
|
|
|
|
if (!read_opts.prefix_same_as_start &&
|
|
|
|
pe->Transform(saved_prev_key) != pe->Transform(start_key)) {
|
|
|
|
break;
|
|
|
|
}
|
2020-08-05 01:50:05 +00:00
|
|
|
CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
|
2020-03-13 18:33:04 +00:00
|
|
|
"value" + std::to_string(i), write_ts_list[i]);
|
|
|
|
++count;
|
|
|
|
++expected_key;
|
|
|
|
it->Next();
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2020-03-13 18:33:04 +00:00
|
|
|
ASSERT_EQ(expected_ub - targets[j] + 1, count);
|
2021-03-10 19:13:55 +00:00
|
|
|
|
|
|
|
count = 0;
|
|
|
|
expected_key = targets[j];
|
|
|
|
it->SeekForPrev(start_key);
|
|
|
|
uint64_t expected_lb = (targets[j] & kPrefixMask);
|
|
|
|
while (it->Valid()) {
|
|
|
|
// Out of prefix
|
|
|
|
if (!read_opts.prefix_same_as_start &&
|
|
|
|
pe->Transform(it->key()) != pe->Transform(start_key)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
|
|
|
|
"value" + std::to_string(i), write_ts_list[i]);
|
|
|
|
++count;
|
|
|
|
--expected_key;
|
|
|
|
it->Prev();
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2021-03-10 19:13:55 +00:00
|
|
|
ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count);
|
2020-03-13 18:33:04 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO(yanqin): consider handling non-fixed-length prefix extractors, e.g.
|
|
|
|
// NoopTransform.
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
|
2020-03-13 18:33:04 +00:00
|
|
|
Timestamp, DBBasicTestWithTimestampPrefixSeek,
|
|
|
|
::testing::Combine(
|
|
|
|
::testing::Values(
|
2020-12-01 22:05:19 +00:00
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
|
2020-03-13 18:33:04 +00:00
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
|
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
|
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
|
|
|
|
::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
|
|
|
|
std::shared_ptr<const FilterPolicy>(
|
|
|
|
NewBloomFilterPolicy(10 /*bits_per_key*/, false)),
|
|
|
|
std::shared_ptr<const FilterPolicy>(
|
|
|
|
NewBloomFilterPolicy(20 /*bits_per_key*/,
|
|
|
|
false))),
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
::testing::Bool(),
|
|
|
|
::testing::Values(
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kHashSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
|
2020-03-13 18:33:04 +00:00
|
|
|
|
2020-05-28 17:37:57 +00:00
|
|
|
class DBBasicTestWithTsIterTombstones
|
|
|
|
: public DBBasicTestWithTimestampBase,
|
|
|
|
public testing::WithParamInterface<
|
|
|
|
std::tuple<std::shared_ptr<const SliceTransform>,
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
std::shared_ptr<const FilterPolicy>, int,
|
|
|
|
BlockBasedTableOptions::IndexType>> {
|
2020-05-28 17:37:57 +00:00
|
|
|
public:
|
|
|
|
DBBasicTestWithTsIterTombstones()
|
|
|
|
: DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {}
|
|
|
|
};
|
|
|
|
|
2021-03-10 19:13:55 +00:00
|
|
|
TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) {
|
2020-05-28 17:37:57 +00:00
|
|
|
constexpr size_t kNumKeysPerFile = 128;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.prefix_extractor = std::get<0>(GetParam());
|
2021-09-08 14:45:59 +00:00
|
|
|
options.memtable_factory.reset(
|
|
|
|
test::NewSpecialSkipListFactory(kNumKeysPerFile));
|
2020-05-28 17:37:57 +00:00
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy = std::get<1>(GetParam());
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
bbto.index_type = std::get<3>(GetParam());
|
2020-05-28 17:37:57 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
options.num_levels = std::get<2>(GetParam());
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
std::vector<std::string> write_ts_strs = {Timestamp(2, 0), Timestamp(4, 0)};
|
|
|
|
constexpr uint64_t kMaxKey = 0xffffffffffffffff;
|
|
|
|
constexpr uint64_t kMinKey = 0xfffffffffffff000;
|
|
|
|
// Insert kMinKey...kMaxKey
|
|
|
|
uint64_t key = kMinKey;
|
|
|
|
WriteOptions write_opts;
|
|
|
|
Slice ts = write_ts_strs[0];
|
|
|
|
do {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status s = db_->Put(write_opts, Key1(key), write_ts_strs[0],
|
|
|
|
"value" + std::to_string(key));
|
2020-05-28 17:37:57 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
if (kMaxKey == key) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
++key;
|
|
|
|
} while (true);
|
2021-03-10 19:13:55 +00:00
|
|
|
|
2020-05-28 17:37:57 +00:00
|
|
|
for (key = kMaxKey; key >= kMinKey; --key) {
|
|
|
|
Status s;
|
|
|
|
if (0 != (key % 2)) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
s = db_->Put(write_opts, Key1(key), write_ts_strs[1],
|
|
|
|
"value1" + std::to_string(key));
|
2020-05-28 17:37:57 +00:00
|
|
|
} else {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
s = db_->Delete(write_opts, Key1(key), write_ts_strs[1]);
|
2020-05-28 17:37:57 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
|
|
|
{
|
|
|
|
std::string read_ts = Timestamp(4, 0);
|
|
|
|
ts = read_ts;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
size_t count = 0;
|
|
|
|
key = kMinKey + 1;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++count, key += 2) {
|
|
|
|
ASSERT_EQ(Key1(key), iter->key());
|
|
|
|
ASSERT_EQ("value1" + std::to_string(key), iter->value());
|
|
|
|
}
|
|
|
|
ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
|
2021-03-10 19:13:55 +00:00
|
|
|
|
|
|
|
for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid();
|
|
|
|
key -= 2, ++count, iter->Prev()) {
|
|
|
|
ASSERT_EQ(Key1(key), iter->key());
|
|
|
|
ASSERT_EQ("value1" + std::to_string(key), iter->value());
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2021-03-10 19:13:55 +00:00
|
|
|
ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
|
2020-05-28 17:37:57 +00:00
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
|
2020-05-28 17:37:57 +00:00
|
|
|
Timestamp, DBBasicTestWithTsIterTombstones,
|
|
|
|
::testing::Combine(
|
|
|
|
::testing::Values(
|
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
|
|
|
|
std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
|
|
|
|
::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
|
|
|
|
std::shared_ptr<const FilterPolicy>(
|
|
|
|
NewBloomFilterPolicy(10, false)),
|
|
|
|
std::shared_ptr<const FilterPolicy>(
|
|
|
|
NewBloomFilterPolicy(20, false))),
|
Fix a bug in key comparison when index type is kBinarySearchWithFirstKey (#8062)
Summary:
When timestamp is enabled, key comparison should take this into account.
In `BlockBasedTableReader::Get()`, `BlockBasedTableReader::MultiGet()`,
assume the target key is `key`, and the timestamp upper bound is `ts`.
The highest key in current block is (key, ts1), while the lowest key in next
block is (key, ts2).
If
```
ts1 > ts > ts2
```
then
```
(key, ts1) < (key, ts) < (key, ts2)
```
It can be shown that if `Compare()` is used, then we will mistakenly skip the next
block. Instead, we should use `CompareWithoutTimestamp()`.
The majority of this PR makes some existing tests in `db_with_timestamp_basic_test.cc`
parameterized so that different index types can be tested. A new unit test is
also added for more coverage.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8062
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D27057557
Pulled By: riversand963
fbshipit-source-id: c1062fa7c159ed600a1ad7e461531d52265021f1
2021-03-16 00:43:02 +00:00
|
|
|
::testing::Values(2, 6),
|
|
|
|
::testing::Values(
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kHashSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
|
2021-07-07 18:13:09 +00:00
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
2020-05-28 17:37:57 +00:00
|
|
|
|
2022-06-10 15:21:08 +00:00
|
|
|
class UpdateFullHistoryTsLowTest : public DBBasicTestWithTimestampBase {
|
|
|
|
public:
|
|
|
|
UpdateFullHistoryTsLowTest()
|
|
|
|
: DBBasicTestWithTimestampBase("/update_full_history_ts_low_test") {}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(UpdateFullHistoryTsLowTest, ConcurrentUpdate) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
std::string lower_ts_low = Timestamp(10, 0);
|
|
|
|
std::string higher_ts_low = Timestamp(25, 0);
|
|
|
|
const size_t kTimestampSize = lower_ts_low.size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
// This workaround swaps `lower_ts_low` originally used for update by the
|
|
|
|
// caller to `higher_ts_low` after its writer is queued to make sure
|
|
|
|
// the caller will always get a TryAgain error.
|
|
|
|
// It mimics cases where two threads update full_history_ts_low concurrently
|
|
|
|
// with one thread writing a higher ts_low and one thread writing a lower
|
|
|
|
// ts_low.
|
|
|
|
VersionEdit* version_edit;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
[&](void* arg) { version_edit = static_cast<VersionEdit*>(arg); });
|
2022-06-10 15:21:08 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionSet::LogAndApply:BeforeWriterWaiting",
|
|
|
|
[&](void* /*arg*/) { version_edit->SetFullHistoryTsLow(higher_ts_low); });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), lower_ts_low)
|
|
|
|
.IsTryAgain());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-07-10 18:03:25 +00:00
|
|
|
// Tests the effect of flag `persist_user_defined_timestamps` on the file
|
|
|
|
// boundaries contained in the Manifest, a.k.a FileMetaData.smallest,
|
|
|
|
// FileMetaData.largest.
|
|
|
|
class HandleFileBoundariesTest
|
|
|
|
: public DBBasicTestWithTimestampBase,
|
|
|
|
public testing::WithParamInterface<test::UserDefinedTimestampTestMode> {
|
|
|
|
public:
|
|
|
|
HandleFileBoundariesTest()
|
|
|
|
: DBBasicTestWithTimestampBase("/handle_file_boundaries") {}
|
|
|
|
};
|
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
TEST_P(HandleFileBoundariesTest, ConfigurePersistUdtWithPut) {
|
2023-07-10 18:03:25 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
// Write a timestamp that is not the min timestamp to help test the behavior
|
|
|
|
// of flag `persist_user_defined_timestamps`.
|
2023-07-26 23:25:06 +00:00
|
|
|
std::string write_ts;
|
|
|
|
std::string min_ts;
|
|
|
|
PutFixed64(&write_ts, 1);
|
|
|
|
PutFixed64(&min_ts, 0);
|
2023-07-10 18:03:25 +00:00
|
|
|
std::string smallest_ukey_without_ts = "bar";
|
|
|
|
std::string largest_ukey_without_ts = "foo";
|
2023-07-26 23:25:06 +00:00
|
|
|
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
2023-07-10 18:03:25 +00:00
|
|
|
bool persist_udt = test::ShouldPersistUDT(GetParam());
|
|
|
|
options.persist_user_defined_timestamps = persist_udt;
|
2023-07-26 23:25:06 +00:00
|
|
|
if (!persist_udt) {
|
|
|
|
options.allow_concurrent_memtable_write = false;
|
|
|
|
}
|
2023-07-10 18:03:25 +00:00
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->Put(WriteOptions(), smallest_ukey_without_ts, write_ts, "val1"));
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->Put(WriteOptions(), largest_ukey_without_ts, write_ts, "val2"));
|
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
// Create a L0 SST file and its record is added to the Manifest.
|
2023-07-10 18:03:25 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
Close();
|
|
|
|
|
|
|
|
options.create_if_missing = false;
|
|
|
|
// Reopen the DB and process manifest file.
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
std::vector<std::vector<FileMetaData>> level_to_files;
|
|
|
|
dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
|
|
|
|
&level_to_files);
|
|
|
|
ASSERT_GT(level_to_files.size(), 1);
|
|
|
|
// L0 only has one SST file.
|
|
|
|
ASSERT_EQ(level_to_files[0].size(), 1);
|
|
|
|
auto file_meta = level_to_files[0][0];
|
|
|
|
if (persist_udt) {
|
|
|
|
ASSERT_EQ(smallest_ukey_without_ts + write_ts,
|
|
|
|
file_meta.smallest.user_key());
|
|
|
|
ASSERT_EQ(largest_ukey_without_ts + write_ts, file_meta.largest.user_key());
|
|
|
|
} else {
|
|
|
|
// If `persist_user_defined_timestamps` is false, the file boundaries should
|
|
|
|
// have the min timestamp. Behind the scenes, when file boundaries in
|
|
|
|
// FileMetaData is persisted to Manifest, the original user-defined
|
|
|
|
// timestamps in user key are stripped. When manifest is read and processed
|
|
|
|
// during DB open, a min timestamp is padded to the file boundaries. This
|
|
|
|
// test's writes contain non min timestamp to verify this logic end-to-end.
|
|
|
|
ASSERT_EQ(smallest_ukey_without_ts + min_ts, file_meta.smallest.user_key());
|
|
|
|
ASSERT_EQ(largest_ukey_without_ts + min_ts, file_meta.largest.user_key());
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
TEST_P(HandleFileBoundariesTest, ConfigurePersistUdtWithRangeDelete) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
// Write a timestamp that is not the min/max timestamp to help test the
|
|
|
|
// behavior of flag `persist_user_defined_timestamps`.
|
|
|
|
std::string write_ts;
|
|
|
|
std::string min_ts;
|
|
|
|
std::string max_ts;
|
|
|
|
PutFixed64(&write_ts, 1);
|
|
|
|
PutFixed64(&min_ts, 0);
|
|
|
|
PutFixed64(&max_ts, std::numeric_limits<uint64_t>::max());
|
|
|
|
std::string smallest_ukey_without_ts = "bar";
|
|
|
|
std::string largest_ukey_without_ts = "foo";
|
|
|
|
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
|
|
|
bool persist_udt = test::ShouldPersistUDT(GetParam());
|
|
|
|
options.persist_user_defined_timestamps = persist_udt;
|
|
|
|
if (!persist_udt) {
|
|
|
|
options.allow_concurrent_memtable_write = false;
|
|
|
|
}
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
|
|
|
|
smallest_ukey_without_ts, largest_ukey_without_ts,
|
|
|
|
write_ts));
|
|
|
|
|
|
|
|
// Create a L0 SST file and its record is added to the Manifest.
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
Close();
|
|
|
|
|
|
|
|
options.create_if_missing = false;
|
|
|
|
// Reopen the DB and process manifest file.
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
std::vector<std::vector<FileMetaData>> level_to_files;
|
|
|
|
dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
|
|
|
|
&level_to_files);
|
|
|
|
ASSERT_GT(level_to_files.size(), 1);
|
|
|
|
// L0 only has one SST file.
|
|
|
|
ASSERT_EQ(level_to_files[0].size(), 1);
|
|
|
|
auto file_meta = level_to_files[0][0];
|
|
|
|
if (persist_udt) {
|
|
|
|
ASSERT_EQ(smallest_ukey_without_ts + write_ts,
|
|
|
|
file_meta.smallest.user_key());
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(smallest_ukey_without_ts + min_ts, file_meta.smallest.user_key());
|
|
|
|
}
|
|
|
|
// When right file boundary comes from range deletion, it uses max timestamp
|
|
|
|
// and a range deletion sentinel that uses the max sequence number to mark the
|
|
|
|
// end key exclusive. This is regardless of whether timestamp is persisted.
|
|
|
|
ASSERT_EQ(largest_ukey_without_ts + max_ts, file_meta.largest.user_key());
|
|
|
|
auto largest_footer = ExtractInternalKeyFooter(file_meta.largest.Encode());
|
|
|
|
ASSERT_EQ(largest_footer, kRangeTombstoneSentinel);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-07-10 18:03:25 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
ConfigurePersistUdt, HandleFileBoundariesTest,
|
|
|
|
::testing::Values(
|
|
|
|
test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp,
|
|
|
|
test::UserDefinedTimestampTestMode::kNormal));
|
|
|
|
|
2023-07-27 03:16:32 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp, EnableDisableUDT) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
// Create a column family without user-defined timestamps.
|
|
|
|
options.comparator = BytewiseComparator();
|
|
|
|
options.persist_user_defined_timestamps = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Create one SST file, its user keys have no user-defined timestamps.
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "foo", "val1"));
|
|
|
|
ASSERT_OK(Flush(0));
|
|
|
|
Close();
|
|
|
|
|
|
|
|
// Reopen the existing column family and enable user-defined timestamps
|
|
|
|
// feature for it.
|
|
|
|
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
|
|
|
options.persist_user_defined_timestamps = false;
|
|
|
|
options.allow_concurrent_memtable_write = false;
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
ASSERT_TRUE(db_->Get(ReadOptions(), "foo", &value).IsInvalidArgument());
|
|
|
|
std::string read_ts;
|
|
|
|
PutFixed64(&read_ts, 0);
|
|
|
|
ReadOptions ropts;
|
|
|
|
Slice read_ts_slice = read_ts;
|
|
|
|
ropts.timestamp = &read_ts_slice;
|
|
|
|
std::string key_ts;
|
|
|
|
// Entries in pre-existing SST files are treated as if they have minimum
|
|
|
|
// user-defined timestamps.
|
|
|
|
ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts));
|
|
|
|
ASSERT_EQ("val1", value);
|
|
|
|
ASSERT_EQ(read_ts, key_ts);
|
|
|
|
|
|
|
|
// Do timestamped read / write.
|
|
|
|
std::string write_ts;
|
|
|
|
PutFixed64(&write_ts, 1);
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val2"));
|
|
|
|
read_ts.clear();
|
|
|
|
PutFixed64(&read_ts, 1);
|
|
|
|
ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts));
|
|
|
|
ASSERT_EQ("val2", value);
|
|
|
|
ASSERT_EQ(write_ts, key_ts);
|
|
|
|
// The user keys in this SST file don't have user-defined timestamps either,
|
|
|
|
// because `persist_user_defined_timestamps` flag is set to false.
|
|
|
|
ASSERT_OK(Flush(0));
|
|
|
|
Close();
|
|
|
|
|
|
|
|
// Reopen the existing column family while disabling user-defined timestamps.
|
|
|
|
options.comparator = BytewiseComparator();
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
ASSERT_TRUE(db_->Get(ropts, "foo", &value).IsInvalidArgument());
|
|
|
|
ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
|
|
|
|
ASSERT_EQ("val2", value);
|
|
|
|
|
|
|
|
// Continue to write / read the column family without user-defined timestamps.
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "foo", "val3"));
|
|
|
|
ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
|
|
|
|
ASSERT_EQ("val3", value);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-09-13 23:34:18 +00:00
|
|
|
// Tests that as long as the
|
|
|
|
// `ReadOptions.timestamp >= SuperVersion.full_history_ts_low` sanity check
|
|
|
|
// passes. The read will be consistent even if the column family's
|
|
|
|
// full_history_ts_low is concurrently increased and collapsed some history
|
|
|
|
// above `ReadOptions.timestamp`.
|
|
|
|
TEST_F(DBBasicTestWithTimestamp,
|
|
|
|
FullHistoryTsLowSanityCheckPassReadIsConsistent) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
|
|
|
// Use UDT in memtable only feature for this test, so we can control that
|
|
|
|
// newly set `full_history_ts_low` collapse history when Flush happens.
|
|
|
|
options.persist_user_defined_timestamps = false;
|
|
|
|
options.allow_concurrent_memtable_write = false;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
std::string min_ts;
|
|
|
|
PutFixed64(&min_ts, 0);
|
|
|
|
|
|
|
|
// Write two versions of the key (1, v1), (3, v3), and always read with
|
|
|
|
// timestamp 2.
|
|
|
|
std::string write_ts;
|
|
|
|
PutFixed64(&write_ts, 1);
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val1"));
|
|
|
|
|
|
|
|
std::string read_ts;
|
|
|
|
PutFixed64(&read_ts, 2);
|
|
|
|
Slice read_ts_slice = read_ts;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
|
|
|
|
// First read, no full_history_ts_low set, sanity check pass.
|
|
|
|
std::string value;
|
|
|
|
std::string timestamp;
|
|
|
|
ASSERT_OK(db_->Get(read_opts, "foo", &value, ×tamp));
|
|
|
|
ASSERT_EQ("val1", value);
|
|
|
|
ASSERT_EQ(write_ts, timestamp);
|
|
|
|
|
|
|
|
std::string full_history_ts_low;
|
|
|
|
std::string marked_ts_low;
|
|
|
|
PutFixed64(&full_history_ts_low, 2);
|
|
|
|
marked_ts_low = full_history_ts_low;
|
|
|
|
ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
full_history_ts_low));
|
|
|
|
ASSERT_OK(Flush(0));
|
|
|
|
|
|
|
|
// Write the (3, v3) entry after flush, otherwise with UDT in memtable only
|
|
|
|
// the previous Flush(0) with full_history_ts_low = 2 will be postponed
|
|
|
|
// waiting for (3, v3) to expire too.
|
|
|
|
write_ts.clear();
|
|
|
|
PutFixed64(&write_ts, 3);
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val3"));
|
|
|
|
|
|
|
|
// Second read:
|
|
|
|
// ReadOptions.timestamp(2) >= SuperVersion.full_history_ts_low(2),
|
|
|
|
// and ReadOptions.timestamp(2) >= ColumnFamilyData.full_history_ts_low(2).
|
|
|
|
// history below 2 is collapsed. Reading at 2 or above 2 is ok.
|
|
|
|
// Sanity check pass. Read return consistent value, but timestamp is already
|
|
|
|
// collapsed.
|
|
|
|
ASSERT_OK(db_->Get(read_opts, "foo", &value, ×tamp));
|
|
|
|
ASSERT_EQ("val1", value);
|
|
|
|
ASSERT_EQ(min_ts, timestamp);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"DBImpl::GetImpl:AfterAcquireSv", [&](void* /*arg*/) {
|
|
|
|
// Concurrently increasing full_history_ts_low and flush to create a
|
|
|
|
// new SuperVersion
|
|
|
|
std::string current_ts_low;
|
|
|
|
ASSERT_OK(db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
¤t_ts_low));
|
|
|
|
if (current_ts_low.empty() || current_ts_low != marked_ts_low) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
full_history_ts_low.clear();
|
|
|
|
PutFixed64(&full_history_ts_low, 4);
|
|
|
|
ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
full_history_ts_low));
|
|
|
|
ASSERT_OK(Flush(0));
|
|
|
|
});
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Third read:
|
|
|
|
// ReadOptions.timestamp(2) >= SuperVersion.full_history_ts_low(2),
|
|
|
|
// but ReadOptions.timestamp(2) < ColumnFamilyData.full_history_ts_low(4).
|
|
|
|
// History below 4 is collapsed in the newly installed SuperVersion. But the
|
|
|
|
// SuperVersion attached to this read still has the history below 4 available.
|
|
|
|
// Sanity check pass. Read return consistent value, timestamp is collapsed.
|
|
|
|
ASSERT_OK(db_->Get(read_opts, "foo", &value, ×tamp));
|
|
|
|
ASSERT_EQ("val1", value);
|
|
|
|
ASSERT_EQ(min_ts, timestamp);
|
|
|
|
|
|
|
|
// Fourth read:
|
|
|
|
// ReadOptions.timestamp(2) < SuperVersion.full_history_ts_low(4).
|
|
|
|
// Sanity check fails. Had it succeeded, the read would return "v3",
|
|
|
|
// which is inconsistent.
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->Get(read_opts, "foo", &value, ×tamp).IsInvalidArgument());
|
|
|
|
Close();
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests that in cases when
|
|
|
|
// `ReadOptions.timestamp >= SuperVersion.full_history_ts_low` sanity check
|
|
|
|
// fails. The referenced SuperVersion is dereferenced and cleaned up properly
|
|
|
|
// for all read APIs that involves this sanity check.
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, FullHistoryTsLowSanityCheckFail) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
|
|
|
// Use UDT in memtable only feature for this test, so we can control that
|
|
|
|
// newly set `full_history_ts_low` collapse history when Flush happens.
|
|
|
|
options.persist_user_defined_timestamps = false;
|
|
|
|
options.allow_concurrent_memtable_write = false;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ColumnFamilyHandle* handle2 = nullptr;
|
|
|
|
Status s = db_->CreateColumnFamily(options, "data", &handle2);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
std::string write_ts;
|
|
|
|
PutFixed64(&write_ts, 1);
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val1"));
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), handle2, "foo", write_ts, "val1"));
|
|
|
|
|
|
|
|
std::string full_history_ts_low;
|
|
|
|
PutFixed64(&full_history_ts_low, 3);
|
|
|
|
ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
full_history_ts_low));
|
|
|
|
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handle2, full_history_ts_low));
|
|
|
|
ASSERT_OK(Flush(0));
|
|
|
|
ASSERT_OK(db_->Flush(FlushOptions(), handle2));
|
|
|
|
|
|
|
|
std::string read_ts;
|
|
|
|
PutFixed64(&read_ts, 2);
|
|
|
|
Slice read_ts_slice = read_ts;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
|
|
|
|
// Get()
|
|
|
|
std::string value;
|
|
|
|
ASSERT_TRUE(db_->Get(read_opts, "foo", &value).IsInvalidArgument());
|
|
|
|
|
|
|
|
// MultiGet()
|
|
|
|
std::vector<ColumnFamilyHandle*> cfhs = {db_->DefaultColumnFamily(), handle2};
|
|
|
|
{
|
|
|
|
std::vector<std::string> key_vals = {"foo", "foo"};
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (size_t j = 0; j < 2; ++j) {
|
2024-01-05 19:53:57 +00:00
|
|
|
keys.emplace_back(key_vals[j]);
|
2023-09-13 23:34:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Status> statuses =
|
|
|
|
db_->MultiGet(read_opts, cfhs, keys, &values);
|
2024-01-05 19:53:57 +00:00
|
|
|
for (const auto& status : statuses) {
|
2023-09-13 23:34:18 +00:00
|
|
|
ASSERT_TRUE(status.IsInvalidArgument());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// MultiGet with only one column family
|
|
|
|
{
|
|
|
|
std::vector<ColumnFamilyHandle*> one_cfh = {db_->DefaultColumnFamily()};
|
|
|
|
std::vector<std::string> key_vals = {"foo"};
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
for (size_t j = 0; j < 1; ++j) {
|
2024-01-05 19:53:57 +00:00
|
|
|
keys.emplace_back(key_vals[j]);
|
2023-09-13 23:34:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Status> statuses =
|
|
|
|
db_->MultiGet(read_opts, one_cfh, keys, &values);
|
2024-01-05 19:53:57 +00:00
|
|
|
for (const auto& status : statuses) {
|
2023-09-13 23:34:18 +00:00
|
|
|
ASSERT_TRUE(status.IsInvalidArgument());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Overloaded version of MultiGet
|
|
|
|
ColumnFamilyHandle* column_families[] = {db_->DefaultColumnFamily(), handle2};
|
|
|
|
{
|
|
|
|
Slice keys[] = {"foo", "foo"};
|
|
|
|
PinnableSlice values[] = {PinnableSlice(), PinnableSlice()};
|
|
|
|
Status statuses[] = {Status::OK(), Status::OK()};
|
|
|
|
db_->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0],
|
|
|
|
&values[0], &statuses[0], /*sorted_input=*/false);
|
2024-01-05 19:53:57 +00:00
|
|
|
for (const auto& status : statuses) {
|
2023-09-13 23:34:18 +00:00
|
|
|
ASSERT_TRUE(status.IsInvalidArgument());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Overloaded versions of MultiGet with one column family
|
|
|
|
{
|
|
|
|
ColumnFamilyHandle* one_column_family[] = {db_->DefaultColumnFamily()};
|
|
|
|
Slice keys[] = {"foo"};
|
|
|
|
PinnableSlice values[] = {PinnableSlice()};
|
|
|
|
Status statuses[] = {Status::OK()};
|
|
|
|
db_->MultiGet(read_opts, /*num_keys=*/1, &one_column_family[0], &keys[0],
|
|
|
|
&values[0], &statuses[0], /*sorted_input=*/false);
|
2024-01-05 19:53:57 +00:00
|
|
|
for (const auto& status : statuses) {
|
2023-09-13 23:34:18 +00:00
|
|
|
ASSERT_TRUE(status.IsInvalidArgument());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewIterator()
|
|
|
|
std::unique_ptr<Iterator> iter(
|
|
|
|
db_->NewIterator(read_opts, db_->DefaultColumnFamily()));
|
|
|
|
ASSERT_TRUE(iter->status().IsInvalidArgument());
|
|
|
|
std::unique_ptr<Iterator> iter2(db_->NewIterator(read_opts, handle2));
|
|
|
|
ASSERT_TRUE(iter2->status().IsInvalidArgument());
|
|
|
|
|
|
|
|
// NewIterators()
|
|
|
|
std::vector<Iterator*> iterators;
|
|
|
|
ASSERT_TRUE(
|
|
|
|
db_->NewIterators(read_opts, cfhs, &iterators).IsInvalidArgument());
|
|
|
|
delete handle2;
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
TEST_F(DBBasicTestWithTimestamp,
|
|
|
|
GCPreserveRangeTombstoneWhenNoOrSmallFullHistoryLow) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
std::string ts_str = Timestamp(1, 0);
|
|
|
|
WriteOptions wopts;
|
|
|
|
ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
|
|
|
|
ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
|
|
|
|
ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
|
|
|
|
ts_str = Timestamp(2, 0);
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3", ts_str));
|
|
|
|
|
|
|
|
ts_str = Timestamp(3, 0);
|
|
|
|
Slice ts = ts_str;
|
|
|
|
ReadOptions ropts;
|
|
|
|
ropts.timestamp = &ts;
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.full_history_ts_low = nullptr;
|
|
|
|
std::string value, key_ts;
|
|
|
|
Status s;
|
|
|
|
auto verify = [&] {
|
|
|
|
s = db_->Get(ropts, "k1", &value);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
|
|
|
|
s = db_->Get(ropts, "k2", &value, &key_ts);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
ASSERT_EQ(key_ts, Timestamp(2, 0));
|
|
|
|
|
|
|
|
ASSERT_OK(db_->Get(ropts, "k3", &value, &key_ts));
|
|
|
|
ASSERT_EQ(value, "v3");
|
|
|
|
ASSERT_EQ(Timestamp(1, 0), key_ts);
|
|
|
|
|
|
|
|
size_t batch_size = 3;
|
|
|
|
std::vector<std::string> key_strs = {"k1", "k2", "k3"};
|
|
|
|
std::vector<Slice> keys{key_strs.begin(), key_strs.end()};
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
|
|
|
db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
|
|
|
|
values.data(), statuses.data(), true /* sorted_input */);
|
|
|
|
ASSERT_TRUE(statuses[0].IsNotFound());
|
|
|
|
ASSERT_TRUE(statuses[1].IsNotFound());
|
|
|
|
ASSERT_OK(statuses[2]);
|
|
|
|
;
|
|
|
|
ASSERT_EQ(values[2], "v3");
|
|
|
|
};
|
|
|
|
verify();
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
verify();
|
|
|
|
std::string lb = Timestamp(0, 0);
|
|
|
|
Slice lb_slice = lb;
|
|
|
|
cro.full_history_ts_low = &lb_slice;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
verify();
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp,
|
|
|
|
GCRangeTombstonesAndCoveredKeysRespectingTslow) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.cache_index_and_filter_blocks = true;
|
|
|
|
bbto.whole_key_filtering = true;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.num_levels = 2;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
WriteOptions wopts;
|
|
|
|
ASSERT_OK(db_->Put(wopts, "k1", Timestamp(1, 0), "v1"));
|
|
|
|
ASSERT_OK(db_->Delete(wopts, "k2", Timestamp(2, 0)));
|
|
|
|
ASSERT_OK(db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3",
|
|
|
|
Timestamp(3, 0)));
|
|
|
|
ASSERT_OK(db_->Put(wopts, "k3", Timestamp(4, 0), "v3"));
|
|
|
|
|
|
|
|
ReadOptions ropts;
|
|
|
|
std::string read_ts = Timestamp(5, 0);
|
|
|
|
Slice read_ts_slice = read_ts;
|
|
|
|
ropts.timestamp = &read_ts_slice;
|
|
|
|
size_t batch_size = 3;
|
|
|
|
std::vector<std::string> key_strs = {"k1", "k2", "k3"};
|
|
|
|
std::vector<Slice> keys = {key_strs.begin(), key_strs.end()};
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
|
|
|
std::vector<std::string> timestamps(batch_size);
|
|
|
|
db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
|
|
|
|
values.data(), timestamps.data(), statuses.data(),
|
|
|
|
true /* sorted_input */);
|
|
|
|
ASSERT_TRUE(statuses[0].IsNotFound());
|
|
|
|
ASSERT_EQ(timestamps[0], Timestamp(3, 0));
|
|
|
|
ASSERT_TRUE(statuses[1].IsNotFound());
|
|
|
|
// DeleteRange has a higher timestamp than Delete for "k2"
|
|
|
|
ASSERT_EQ(timestamps[1], Timestamp(3, 0));
|
|
|
|
ASSERT_OK(statuses[2]);
|
|
|
|
ASSERT_EQ(values[2], "v3");
|
|
|
|
ASSERT_EQ(timestamps[2], Timestamp(4, 0));
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
// Range tombstone has timestamp >= full_history_ts_low, covered keys
|
|
|
|
// are not dropped.
|
|
|
|
std::string compaction_ts_str = Timestamp(2, 0);
|
|
|
|
Slice compaction_ts = compaction_ts_str;
|
|
|
|
cro.full_history_ts_low = &compaction_ts;
|
|
|
|
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
ropts.timestamp = &compaction_ts;
|
|
|
|
std::string value, ts;
|
|
|
|
ASSERT_OK(db_->Get(ropts, "k1", &value, &ts));
|
|
|
|
ASSERT_EQ(value, "v1");
|
|
|
|
// timestamp is below full_history_ts_low, zeroed out as the key goes into
|
|
|
|
// bottommost level
|
|
|
|
ASSERT_EQ(ts, Timestamp(0, 0));
|
|
|
|
ASSERT_TRUE(db_->Get(ropts, "k2", &value, &ts).IsNotFound());
|
|
|
|
ASSERT_EQ(ts, Timestamp(2, 0));
|
|
|
|
|
|
|
|
compaction_ts_str = Timestamp(4, 0);
|
|
|
|
compaction_ts = compaction_ts_str;
|
|
|
|
cro.full_history_ts_low = &compaction_ts;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
ropts.timestamp = &read_ts_slice;
|
|
|
|
// k1, k2 and the range tombstone should be dropped
|
|
|
|
// k3 should still exist
|
|
|
|
db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
|
|
|
|
values.data(), timestamps.data(), statuses.data(),
|
|
|
|
true /* sorted_input */);
|
|
|
|
ASSERT_TRUE(statuses[0].IsNotFound());
|
|
|
|
ASSERT_TRUE(timestamps[0].empty());
|
|
|
|
ASSERT_TRUE(statuses[1].IsNotFound());
|
|
|
|
ASSERT_TRUE(timestamps[1].empty());
|
|
|
|
ASSERT_OK(statuses[2]);
|
|
|
|
ASSERT_EQ(values[2], "v3");
|
|
|
|
ASSERT_EQ(timestamps[2], Timestamp(4, 0));
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
class DeleteRangeWithTimestampTableOptions
|
|
|
|
: public DBBasicTestWithTimestampBase,
|
|
|
|
public testing::WithParamInterface<
|
|
|
|
std::tuple<BlockBasedTableOptions::IndexType,
|
|
|
|
test::UserDefinedTimestampTestMode>> {
|
|
|
|
public:
|
|
|
|
explicit DeleteRangeWithTimestampTableOptions()
|
|
|
|
: DBBasicTestWithTimestampBase(
|
|
|
|
"delete_range_with_timestamp_table_options") {}
|
|
|
|
};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
Timestamp, DeleteRangeWithTimestampTableOptions,
|
|
|
|
testing::Combine(
|
|
|
|
testing::Values(
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kHashSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
|
|
|
|
BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
|
|
|
|
testing::Values(
|
|
|
|
test::UserDefinedTimestampTestMode::kNormal,
|
|
|
|
test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp)));
|
|
|
|
|
|
|
|
TEST_P(DeleteRangeWithTimestampTableOptions, BasicReadAndIterate) {
|
2022-09-30 23:13:03 +00:00
|
|
|
const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
|
|
|
|
Options options = CurrentOptions();
|
2024-01-29 19:37:34 +00:00
|
|
|
options.disable_auto_compactions = true;
|
2022-09-30 23:13:03 +00:00
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(3));
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
BlockBasedTableOptions bbto;
|
2024-01-29 19:37:34 +00:00
|
|
|
bbto.index_type = std::get<0>(GetParam());
|
2022-09-30 23:13:03 +00:00
|
|
|
bbto.block_size = 100;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
2024-01-29 19:37:34 +00:00
|
|
|
bool persist_udt = test::ShouldPersistUDT(std::get<1>(GetParam()));
|
|
|
|
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
|
|
|
options.persist_user_defined_timestamps = persist_udt;
|
|
|
|
// UDT in memtables only not compatible with concurrent memtable writes.
|
|
|
|
options.allow_concurrent_memtable_write = persist_udt;
|
2022-09-30 23:13:03 +00:00
|
|
|
options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Write half of the keys before the tombstone and half after the tombstone.
|
|
|
|
// Only covered keys (i.e., within the range and older than the tombstone)
|
|
|
|
// should be deleted.
|
2024-01-29 19:37:34 +00:00
|
|
|
std::string full_history_ts_low;
|
|
|
|
int cutoff_ts = 0;
|
2022-09-30 23:13:03 +00:00
|
|
|
for (int i = 0; i < kNum; ++i) {
|
2024-01-29 19:37:34 +00:00
|
|
|
std::string write_ts;
|
|
|
|
PutFixed64(&write_ts, i);
|
2022-09-30 23:13:03 +00:00
|
|
|
if (i == kNum / 2) {
|
|
|
|
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
|
2024-01-29 19:37:34 +00:00
|
|
|
Key1(kRangeBegin), Key1(kRangeEnd), write_ts));
|
2022-09-30 23:13:03 +00:00
|
|
|
}
|
2024-01-29 19:37:34 +00:00
|
|
|
ASSERT_OK(
|
|
|
|
db_->Put(WriteOptions(), Key1(i), write_ts, "val" + std::to_string(i)));
|
2022-09-30 23:13:03 +00:00
|
|
|
if (i == kNum - kNumPerFile) {
|
2024-01-29 19:37:34 +00:00
|
|
|
if (!persist_udt) {
|
|
|
|
// When UDTs are not persisted, mark the timestamps in the Memtables as
|
|
|
|
// all expired so the followed flush can go through.
|
|
|
|
cutoff_ts = i + 1;
|
|
|
|
PutFixed64(&full_history_ts_low, cutoff_ts);
|
|
|
|
ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
full_history_ts_low));
|
|
|
|
}
|
2022-09-30 23:13:03 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.total_order_seek = true;
|
2024-01-29 19:37:34 +00:00
|
|
|
std::string read_ts;
|
|
|
|
PutFixed64(&read_ts, kNum);
|
2022-09-30 23:13:03 +00:00
|
|
|
Slice read_ts_slice = read_ts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
{
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
|
|
|
|
int expected = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_EQ(Key1(expected), iter->key());
|
|
|
|
if (expected == kRangeBegin - 1) {
|
|
|
|
expected = kNum / 2;
|
|
|
|
} else {
|
|
|
|
++expected;
|
|
|
|
}
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-09-30 23:13:03 +00:00
|
|
|
ASSERT_EQ(kNum, expected);
|
|
|
|
|
|
|
|
expected = kNum / 2;
|
|
|
|
for (iter->Seek(Key1(kNum / 2)); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_EQ(Key1(expected), iter->key());
|
|
|
|
++expected;
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-09-30 23:13:03 +00:00
|
|
|
ASSERT_EQ(kNum, expected);
|
|
|
|
|
|
|
|
expected = kRangeBegin - 1;
|
|
|
|
for (iter->SeekForPrev(Key1(kNum / 2 - 1)); iter->Valid(); iter->Prev()) {
|
|
|
|
ASSERT_EQ(Key1(expected), iter->key());
|
|
|
|
--expected;
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-09-30 23:13:03 +00:00
|
|
|
ASSERT_EQ(-1, expected);
|
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
// Cannot read below the cutoff timestamp when timestamps are not persisted.
|
|
|
|
if (persist_udt) {
|
|
|
|
read_ts.clear();
|
|
|
|
PutFixed64(&read_ts, 0);
|
|
|
|
read_ts_slice = read_ts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
iter.reset(db_->NewIterator(read_opts));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key(), Key1(0));
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
2022-09-30 23:13:03 +00:00
|
|
|
}
|
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
read_ts.clear();
|
|
|
|
PutFixed64(&read_ts, kNum);
|
2022-09-30 23:13:03 +00:00
|
|
|
read_ts_slice = read_ts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
std::string value, timestamp;
|
|
|
|
Status s;
|
2024-01-29 19:37:34 +00:00
|
|
|
std::string expected_ts;
|
|
|
|
int int_expected_ts;
|
2022-09-30 23:13:03 +00:00
|
|
|
for (int i = 0; i < kNum; ++i) {
|
|
|
|
s = db_->Get(read_opts, Key1(i), &value, ×tamp);
|
|
|
|
if (i >= kRangeBegin && i < kNum / 2) {
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
2024-01-29 19:37:34 +00:00
|
|
|
int_expected_ts = (persist_udt || kNum / 2 >= cutoff_ts) ? kNum / 2 : 0;
|
2022-09-30 23:13:03 +00:00
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(value, "val" + std::to_string(i));
|
2024-01-29 19:37:34 +00:00
|
|
|
int_expected_ts = (persist_udt || i >= cutoff_ts) ? i : 0;
|
2022-09-30 23:13:03 +00:00
|
|
|
}
|
2024-01-29 19:37:34 +00:00
|
|
|
expected_ts.clear();
|
|
|
|
PutFixed64(&expected_ts, int_expected_ts);
|
|
|
|
ASSERT_EQ(timestamp, expected_ts);
|
2022-09-30 23:13:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
size_t batch_size = kNum;
|
|
|
|
std::vector<std::string> key_strs(batch_size);
|
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
|
|
|
std::vector<std::string> timestamps(batch_size);
|
|
|
|
for (int i = 0; i < kNum; ++i) {
|
|
|
|
key_strs[i] = Key1(i);
|
|
|
|
keys[i] = key_strs[i];
|
|
|
|
}
|
|
|
|
db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size, keys.data(),
|
|
|
|
values.data(), timestamps.data(), statuses.data(),
|
|
|
|
true /* sorted_input */);
|
|
|
|
for (int i = 0; i < kNum; ++i) {
|
|
|
|
if (i >= kRangeBegin && i < kNum / 2) {
|
|
|
|
ASSERT_TRUE(statuses[i].IsNotFound());
|
2024-01-29 19:37:34 +00:00
|
|
|
int_expected_ts = (persist_udt || kNum / 2 >= cutoff_ts) ? kNum / 2 : 0;
|
2022-09-30 23:13:03 +00:00
|
|
|
} else {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
ASSERT_EQ(values[i], "val" + std::to_string(i));
|
2024-01-29 19:37:34 +00:00
|
|
|
int_expected_ts = (persist_udt || i >= cutoff_ts) ? i : 0;
|
|
|
|
}
|
|
|
|
expected_ts.clear();
|
|
|
|
PutFixed64(&expected_ts, int_expected_ts);
|
|
|
|
ASSERT_EQ(timestamps[i], expected_ts);
|
|
|
|
}
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
|
|
|
|
if (!persist_udt) {
|
|
|
|
// Mark everything expired so manual compaction can go through
|
|
|
|
full_history_ts_low.clear();
|
|
|
|
PutFixed64(&full_history_ts_low, kNum);
|
|
|
|
ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
|
|
|
|
full_history_ts_low));
|
|
|
|
}
|
|
|
|
Slice compaction_ts = full_history_ts_low;
|
|
|
|
cro.full_history_ts_low = &compaction_ts;
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
for (int i = kRangeBegin; i < kNum / 2; ++i) {
|
|
|
|
s = db_->Get(read_opts, Key1(i), &value, ×tamp);
|
|
|
|
ASSERT_TRUE(s.IsNotFound());
|
|
|
|
if (persist_udt) {
|
|
|
|
expected_ts.clear();
|
|
|
|
PutFixed64(&expected_ts, kNum / 2);
|
|
|
|
ASSERT_EQ(timestamp, expected_ts);
|
|
|
|
} else {
|
|
|
|
// When timestamps are not persisted, data in SST files all logically have
|
|
|
|
// min timestamp. A compaction to the last level will drop the range
|
|
|
|
// tombstone.
|
|
|
|
ASSERT_TRUE(timestamp.empty());
|
2022-09-30 23:13:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, DeleteRangeGetIteratorWithSnapshot) {
|
|
|
|
// 4 keys 0, 1, 2, 3 at timestamps 0, 1, 2, 3 respectively.
|
|
|
|
// A range tombstone [1, 3) at timestamp 1 and has a sequence number between
|
|
|
|
// key 1 and 2.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
WriteOptions write_opts;
|
|
|
|
std::string put_ts = Timestamp(0, 0);
|
|
|
|
const int kNum = 4, kNumPerFile = 1, kRangeBegin = 1, kRangeEnd = 3;
|
|
|
|
options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
|
|
|
|
const Snapshot* before_tombstone = nullptr;
|
|
|
|
const Snapshot* after_tombstone = nullptr;
|
|
|
|
for (int i = 0; i < kNum; ++i) {
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
|
|
|
|
"val" + std::to_string(i)));
|
|
|
|
if (i == kRangeBegin) {
|
|
|
|
before_tombstone = db_->GetSnapshot();
|
|
|
|
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
|
|
|
|
Key1(kRangeBegin), Key1(kRangeEnd),
|
|
|
|
Timestamp(kRangeBegin, 0)));
|
|
|
|
}
|
|
|
|
if (i == kNum / 2) {
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(before_tombstone);
|
|
|
|
after_tombstone = db_->GetSnapshot();
|
|
|
|
// snapshot and ts before tombstone
|
|
|
|
std::string read_ts_str = Timestamp(kRangeBegin - 1, 0); // (0, 0)
|
|
|
|
Slice read_ts = read_ts_str;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
read_opts.snapshot = before_tombstone;
|
|
|
|
std::vector<Status> expected_status = {
|
|
|
|
Status::OK(), Status::NotFound(), Status::NotFound(), Status::NotFound()};
|
|
|
|
std::vector<std::string> expected_values(kNum);
|
|
|
|
expected_values[0] = "val" + std::to_string(0);
|
|
|
|
std::vector<std::string> expected_timestamps(kNum);
|
|
|
|
expected_timestamps[0] = Timestamp(0, 0);
|
|
|
|
|
|
|
|
size_t batch_size = kNum;
|
|
|
|
std::vector<std::string> key_strs(batch_size);
|
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
|
|
|
std::vector<std::string> timestamps(batch_size);
|
|
|
|
for (int i = 0; i < kNum; ++i) {
|
|
|
|
key_strs[i] = Key1(i);
|
|
|
|
keys[i] = key_strs[i];
|
|
|
|
}
|
|
|
|
|
|
|
|
auto verify = [&] {
|
|
|
|
db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size,
|
|
|
|
keys.data(), values.data(), timestamps.data(),
|
|
|
|
statuses.data(), true /* sorted_input */);
|
|
|
|
std::string value, timestamp;
|
|
|
|
Status s;
|
|
|
|
for (int i = 0; i < kNum; ++i) {
|
|
|
|
s = db_->Get(read_opts, Key1(i), &value, ×tamp);
|
|
|
|
ASSERT_EQ(s, expected_status[i]);
|
|
|
|
ASSERT_EQ(statuses[i], expected_status[i]);
|
|
|
|
if (s.ok()) {
|
|
|
|
ASSERT_EQ(value, expected_values[i]);
|
|
|
|
ASSERT_EQ(values[i], expected_values[i]);
|
|
|
|
}
|
|
|
|
if (!timestamp.empty()) {
|
|
|
|
ASSERT_EQ(timestamp, expected_timestamps[i]);
|
|
|
|
ASSERT_EQ(timestamps[i], expected_timestamps[i]);
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(timestamps[i].empty());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
|
|
|
|
std::unique_ptr<Iterator> iter_for_seek(db_->NewIterator(read_opts));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
for (int i = 0; i < kNum; ++i) {
|
|
|
|
if (expected_status[i].ok()) {
|
|
|
|
auto verify_iter = [&](Iterator* iter_ptr) {
|
|
|
|
ASSERT_TRUE(iter_ptr->Valid());
|
|
|
|
ASSERT_EQ(iter_ptr->key(), keys[i]);
|
|
|
|
ASSERT_EQ(iter_ptr->value(), expected_values[i]);
|
|
|
|
ASSERT_EQ(iter_ptr->timestamp(), expected_timestamps[i]);
|
|
|
|
};
|
|
|
|
verify_iter(iter.get());
|
|
|
|
iter->Next();
|
|
|
|
|
|
|
|
iter_for_seek->Seek(keys[i]);
|
|
|
|
verify_iter(iter_for_seek.get());
|
|
|
|
|
|
|
|
iter_for_seek->SeekForPrev(keys[i]);
|
|
|
|
verify_iter(iter_for_seek.get());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
};
|
|
|
|
|
|
|
|
verify();
|
|
|
|
|
|
|
|
// snapshot before tombstone and ts after tombstone
|
|
|
|
read_ts_str = Timestamp(kNum, 0); // (4, 0)
|
|
|
|
read_ts = read_ts_str;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
read_opts.snapshot = before_tombstone;
|
|
|
|
expected_status[1] = Status::OK();
|
|
|
|
expected_timestamps[1] = Timestamp(1, 0);
|
|
|
|
expected_values[1] = "val" + std::to_string(1);
|
|
|
|
verify();
|
|
|
|
|
|
|
|
// snapshot after tombstone and ts before tombstone
|
|
|
|
read_ts_str = Timestamp(kRangeBegin - 1, 0); // (0, 0)
|
|
|
|
read_ts = read_ts_str;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
read_opts.snapshot = after_tombstone;
|
|
|
|
expected_status[1] = Status::NotFound();
|
|
|
|
expected_timestamps[1].clear();
|
|
|
|
expected_values[1].clear();
|
|
|
|
verify();
|
|
|
|
|
|
|
|
// snapshot and ts after tombstone
|
|
|
|
read_ts_str = Timestamp(kNum, 0); // (4, 0)
|
|
|
|
read_ts = read_ts_str;
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
read_opts.snapshot = after_tombstone;
|
|
|
|
for (int i = 0; i < kNum; ++i) {
|
|
|
|
if (i == kRangeBegin) {
|
|
|
|
expected_status[i] = Status::NotFound();
|
|
|
|
expected_values[i].clear();
|
|
|
|
} else {
|
|
|
|
expected_status[i] = Status::OK();
|
|
|
|
expected_values[i] = "val" + std::to_string(i);
|
|
|
|
}
|
|
|
|
expected_timestamps[i] = Timestamp(i, 0);
|
|
|
|
}
|
|
|
|
verify();
|
|
|
|
|
|
|
|
db_->ReleaseSnapshot(before_tombstone);
|
|
|
|
db_->ReleaseSnapshot(after_tombstone);
|
|
|
|
Close();
|
|
|
|
}
|
2022-11-01 05:28:58 +00:00
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, MergeBasic) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
const std::array<std::string, 3> write_ts_strs = {
|
|
|
|
Timestamp(100, 0), Timestamp(200, 0), Timestamp(300, 0)};
|
|
|
|
constexpr size_t kNumOfUniqKeys = 100;
|
|
|
|
ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
|
|
|
|
|
|
|
|
for (size_t i = 0; i < write_ts_strs.size(); ++i) {
|
|
|
|
for (size_t j = 0; j < kNumOfUniqKeys; ++j) {
|
|
|
|
Status s;
|
|
|
|
if (i == 0) {
|
|
|
|
const std::string val = "v" + std::to_string(j) + "_0";
|
|
|
|
s = db_->Put(WriteOptions(), Key1(j), write_ts_strs[i], val);
|
|
|
|
} else {
|
|
|
|
const std::string merge_op = std::to_string(i);
|
|
|
|
s = db_->Merge(WriteOptions(), default_cf, Key1(j), write_ts_strs[i],
|
|
|
|
merge_op);
|
|
|
|
}
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::array<std::string, 3> read_ts_strs = {
|
|
|
|
Timestamp(150, 0), Timestamp(250, 0), Timestamp(350, 0)};
|
|
|
|
|
|
|
|
const auto verify_db_with_get = [&]() {
|
|
|
|
for (size_t i = 0; i < kNumOfUniqKeys; ++i) {
|
|
|
|
const std::string base_val = "v" + std::to_string(i) + "_0";
|
|
|
|
const std::array<std::string, 3> expected_values = {
|
|
|
|
base_val, base_val + ".1", base_val + ".1.2"};
|
|
|
|
const std::array<std::string, 3>& expected_ts = write_ts_strs;
|
|
|
|
ReadOptions read_opts;
|
|
|
|
for (size_t j = 0; j < read_ts_strs.size(); ++j) {
|
|
|
|
Slice read_ts = read_ts_strs[j];
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::string value;
|
|
|
|
std::string ts;
|
|
|
|
const Status s = db_->Get(read_opts, Key1(i), &value, &ts);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(expected_values[j], value);
|
|
|
|
ASSERT_EQ(expected_ts[j], ts);
|
|
|
|
|
|
|
|
// Do Seek/SeekForPrev
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
it->Seek(Key1(i));
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
ASSERT_EQ(expected_values[j], it->value());
|
|
|
|
ASSERT_EQ(expected_ts[j], it->timestamp());
|
|
|
|
|
|
|
|
it->SeekForPrev(Key1(i));
|
|
|
|
ASSERT_TRUE(it->Valid());
|
|
|
|
ASSERT_EQ(expected_values[j], it->value());
|
|
|
|
ASSERT_EQ(expected_ts[j], it->timestamp());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
const auto verify_db_with_iterator = [&]() {
|
|
|
|
std::string value_suffix;
|
|
|
|
for (size_t i = 0; i < read_ts_strs.size(); ++i) {
|
|
|
|
ReadOptions read_opts;
|
|
|
|
Slice read_ts = read_ts_strs[i];
|
|
|
|
read_opts.timestamp = &read_ts;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
size_t key_int_val = 0;
|
|
|
|
for (it->SeekToFirst(); it->Valid(); it->Next(), ++key_int_val) {
|
|
|
|
const std::string key = Key1(key_int_val);
|
|
|
|
const std::string value =
|
|
|
|
"v" + std::to_string(key_int_val) + "_0" + value_suffix;
|
|
|
|
ASSERT_EQ(key, it->key());
|
|
|
|
ASSERT_EQ(value, it->value());
|
|
|
|
ASSERT_EQ(write_ts_strs[i], it->timestamp());
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
EXPECT_OK(it->status());
|
2022-11-01 05:28:58 +00:00
|
|
|
ASSERT_EQ(kNumOfUniqKeys, key_int_val);
|
|
|
|
|
|
|
|
key_int_val = kNumOfUniqKeys - 1;
|
|
|
|
for (it->SeekToLast(); it->Valid(); it->Prev(), --key_int_val) {
|
|
|
|
const std::string key = Key1(key_int_val);
|
|
|
|
const std::string value =
|
|
|
|
"v" + std::to_string(key_int_val) + "_0" + value_suffix;
|
|
|
|
ASSERT_EQ(key, it->key());
|
|
|
|
ASSERT_EQ(value, it->value());
|
|
|
|
ASSERT_EQ(write_ts_strs[i], it->timestamp());
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(it->status());
|
2022-11-01 05:28:58 +00:00
|
|
|
ASSERT_EQ(std::numeric_limits<size_t>::max(), key_int_val);
|
|
|
|
|
|
|
|
value_suffix = value_suffix + "." + std::to_string(i + 1);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
verify_db_with_get();
|
|
|
|
verify_db_with_iterator();
|
|
|
|
|
|
|
|
ASSERT_OK(db_->Flush(FlushOptions()));
|
|
|
|
|
|
|
|
verify_db_with_get();
|
|
|
|
verify_db_with_iterator();
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
2022-11-03 20:02:06 +00:00
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, MergeAfterDeletion) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ColumnFamilyHandle* const column_family = db_->DefaultColumnFamily();
|
|
|
|
|
|
|
|
const size_t num_keys_per_file = 10;
|
|
|
|
const size_t num_merges_per_key = 2;
|
|
|
|
for (size_t i = 0; i < num_keys_per_file; ++i) {
|
|
|
|
std::string ts = Timestamp(i + 10000, 0);
|
|
|
|
Status s = db_->Delete(WriteOptions(), Key1(i), ts);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
for (size_t j = 1; j <= num_merges_per_key; ++j) {
|
|
|
|
ts = Timestamp(i + 10000 + j, 0);
|
|
|
|
s = db_->Merge(WriteOptions(), column_family, Key1(i), ts,
|
|
|
|
std::to_string(j));
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const auto verify_db = [&]() {
|
|
|
|
ReadOptions read_opts;
|
|
|
|
std::string read_ts_str = Timestamp(20000, 0);
|
|
|
|
Slice ts = read_ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
|
|
|
|
size_t count = 0;
|
|
|
|
for (it->SeekToFirst(); it->Valid(); it->Next(), ++count) {
|
|
|
|
std::string key = Key1(count);
|
|
|
|
ASSERT_EQ(key, it->key());
|
|
|
|
std::string value;
|
|
|
|
for (size_t j = 1; j <= num_merges_per_key; ++j) {
|
|
|
|
value.append(std::to_string(j));
|
|
|
|
if (j < num_merges_per_key) {
|
|
|
|
value.push_back('.');
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_EQ(value, it->value());
|
|
|
|
std::string ts1 = Timestamp(count + 10000 + num_merges_per_key, 0);
|
|
|
|
ASSERT_EQ(ts1, it->timestamp());
|
|
|
|
}
|
|
|
|
ASSERT_OK(it->status());
|
|
|
|
ASSERT_EQ(num_keys_per_file, count);
|
|
|
|
for (it->SeekToLast(); it->Valid(); it->Prev(), --count) {
|
|
|
|
std::string key = Key1(count - 1);
|
|
|
|
ASSERT_EQ(key, it->key());
|
|
|
|
std::string value;
|
|
|
|
for (size_t j = 1; j <= num_merges_per_key; ++j) {
|
|
|
|
value.append(std::to_string(j));
|
|
|
|
if (j < num_merges_per_key) {
|
|
|
|
value.push_back('.');
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_EQ(value, it->value());
|
|
|
|
std::string ts1 = Timestamp(count - 1 + 10000 + num_merges_per_key, 0);
|
|
|
|
ASSERT_EQ(ts1, it->timestamp());
|
|
|
|
}
|
|
|
|
ASSERT_OK(it->status());
|
|
|
|
ASSERT_EQ(0, count);
|
|
|
|
};
|
|
|
|
|
|
|
|
verify_db();
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
2023-01-19 00:38:07 +00:00
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, RangeTombstoneApproximateSize) {
|
|
|
|
// Test code path for calculating range tombstone compensated size
|
|
|
|
// during flush and compaction.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
// So that the compaction below is non-bottommost and will calcualte
|
|
|
|
// compensated range tombstone size.
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), Key(1), Timestamp(1, 0), "val"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(5);
|
|
|
|
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
|
|
|
|
Key(1), Timestamp(1, 0)));
|
|
|
|
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
|
|
|
|
Key(2), Timestamp(2, 0)));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(dbfull()->RunManualCompaction(
|
|
|
|
static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
|
|
|
|
->cfd(),
|
|
|
|
0 /* input_level */, 1 /* output_level */, CompactRangeOptions(),
|
|
|
|
nullptr /* begin */, nullptr /* end */, true /* exclusive */,
|
|
|
|
true /* disallow_trivial_move */,
|
|
|
|
std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
|
|
|
|
"" /*trim_ts*/));
|
|
|
|
}
|
2023-02-21 19:57:58 +00:00
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, IterSeekToLastWithIterateUpperbound) {
|
|
|
|
// Test for a bug fix where DBIter::SeekToLast() could fail when
|
|
|
|
// iterate_upper_bound and iter_start_ts are both set.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(db_->Put(WriteOptions(), Key(1), Timestamp(2, 0), "val"));
|
|
|
|
ReadOptions ro;
|
|
|
|
std::string k = Key(1);
|
|
|
|
Slice k_slice = k;
|
|
|
|
ro.iterate_upper_bound = &k_slice;
|
|
|
|
std::string ts = Timestamp(3, 0);
|
|
|
|
Slice read_ts = ts;
|
|
|
|
ro.timestamp = &read_ts;
|
|
|
|
std::string start_ts = Timestamp(0, 0);
|
|
|
|
Slice start_ts_slice = start_ts;
|
|
|
|
ro.iter_start_ts = &start_ts_slice;
|
|
|
|
std::unique_ptr<Iterator> iter{db_->NewIterator(ro)};
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
2023-04-06 22:39:38 +00:00
|
|
|
|
|
|
|
TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
const size_t kTimestampSize = Timestamp(0, 0).size();
|
|
|
|
TestComparator test_cmp(kTimestampSize);
|
|
|
|
options.comparator = &test_cmp;
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.block_size = 100;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Put
|
|
|
|
// Create two SST files
|
|
|
|
// file1: key => [1, 3], timestamp => [10, 20]
|
|
|
|
// file2, key => [2, 4], timestamp => [30, 40]
|
|
|
|
{
|
|
|
|
WriteOptions write_opts;
|
|
|
|
std::string write_ts = Timestamp(10, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, Key1(1), write_ts, "value1"));
|
|
|
|
write_ts = Timestamp(20, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, Key1(3), write_ts, "value3"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
write_ts = Timestamp(30, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, Key1(2), write_ts, "value2"));
|
|
|
|
write_ts = Timestamp(40, 0);
|
|
|
|
ASSERT_OK(db_->Put(write_opts, Key1(4), write_ts, "value4"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get with timestamp
|
|
|
|
{
|
|
|
|
auto prev_checked_events = options.statistics->getTickerCount(
|
|
|
|
Tickers::TIMESTAMP_FILTER_TABLE_CHECKED);
|
|
|
|
auto prev_filtered_events = options.statistics->getTickerCount(
|
|
|
|
Tickers::TIMESTAMP_FILTER_TABLE_FILTERED);
|
|
|
|
|
|
|
|
// key=3 (ts=20) does not exist at timestamp=1
|
|
|
|
std::string read_ts_str = Timestamp(1, 0);
|
|
|
|
Slice read_ts_slice = Slice(read_ts_str);
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
2024-01-05 19:53:57 +00:00
|
|
|
std::string value_from_get;
|
|
|
|
std::string timestamp_from_get;
|
2023-04-06 22:39:38 +00:00
|
|
|
auto status =
|
|
|
|
db_->Get(read_opts, Key1(3), &value_from_get, ×tamp_from_get);
|
|
|
|
ASSERT_TRUE(status.IsNotFound());
|
|
|
|
ASSERT_EQ(value_from_get, std::string(""));
|
|
|
|
ASSERT_EQ(timestamp_from_get, std::string(""));
|
|
|
|
|
|
|
|
// key=3 is in the key ranges for both files, so both files will be queried.
|
|
|
|
// The table read was skipped because the timestamp is out of the table
|
|
|
|
// range, i.e.., 1 < [10,20], [30,40].
|
|
|
|
// The tickers increase by 2 due to 2 files.
|
|
|
|
ASSERT_EQ(prev_checked_events + 2,
|
|
|
|
options.statistics->getTickerCount(
|
|
|
|
Tickers::TIMESTAMP_FILTER_TABLE_CHECKED));
|
|
|
|
ASSERT_EQ(prev_filtered_events + 2,
|
|
|
|
options.statistics->getTickerCount(
|
|
|
|
Tickers::TIMESTAMP_FILTER_TABLE_FILTERED));
|
|
|
|
|
|
|
|
// key=3 (ts=20) exists at timestamp = 25
|
|
|
|
read_ts_str = Timestamp(25, 0);
|
|
|
|
read_ts_slice = Slice(read_ts_str);
|
|
|
|
read_opts.timestamp = &read_ts_slice;
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->Get(read_opts, Key1(3), &value_from_get, ×tamp_from_get));
|
|
|
|
ASSERT_EQ("value3", value_from_get);
|
|
|
|
ASSERT_EQ(Timestamp(20, 0), timestamp_from_get);
|
|
|
|
|
|
|
|
// file1 was not skipped, because the timestamp is in range, [10,20] < 25.
|
|
|
|
// file2 was skipped, because the timestamp is not in range, 25 < [30,40].
|
|
|
|
// So the checked ticker increase by 2 due to 2 files;
|
|
|
|
// filtered ticker increase by 1 because file2 was skipped
|
|
|
|
ASSERT_EQ(prev_checked_events + 4,
|
|
|
|
options.statistics->getTickerCount(
|
|
|
|
Tickers::TIMESTAMP_FILTER_TABLE_CHECKED));
|
|
|
|
ASSERT_EQ(prev_filtered_events + 3,
|
|
|
|
options.statistics->getTickerCount(
|
|
|
|
Tickers::TIMESTAMP_FILTER_TABLE_FILTERED));
|
|
|
|
}
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-03-13 18:33:04 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
RegisterCustomObjects(argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
2024-02-16 17:21:06 +00:00
|
|
|
}
|