2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
//
|
|
|
|
// WriteBatch::rep_ :=
|
|
|
|
// sequence: fixed64
|
|
|
|
// count: fixed32
|
|
|
|
// data: record[count]
|
|
|
|
// record :=
|
2013-03-21 22:59:47 +00:00
|
|
|
// kTypeValue varstring varstring
|
2011-03-18 22:37:00 +00:00
|
|
|
// kTypeDeletion varstring
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
// kTypeSingleDeletion varstring
|
2018-01-23 20:46:12 +00:00
|
|
|
// kTypeRangeDeletion varstring varstring
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
// kTypeMerge varstring varstring
|
2014-01-07 22:41:42 +00:00
|
|
|
// kTypeColumnFamilyValue varint32 varstring varstring
|
2018-01-23 20:46:12 +00:00
|
|
|
// kTypeColumnFamilyDeletion varint32 varstring
|
|
|
|
// kTypeColumnFamilySingleDeletion varint32 varstring
|
|
|
|
// kTypeColumnFamilyRangeDeletion varint32 varstring varstring
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
// kTypeColumnFamilyMerge varint32 varstring varstring
|
2022-04-01 17:30:17 +00:00
|
|
|
// kTypeBeginPrepareXID
|
|
|
|
// kTypeEndPrepareXID varstring
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
// kTypeCommitXID varstring
|
2022-04-01 17:30:17 +00:00
|
|
|
// kTypeCommitXIDAndTimestamp varstring varstring
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
// kTypeRollbackXID varstring
|
2022-04-01 17:30:17 +00:00
|
|
|
// kTypeBeginPersistedPrepareXID
|
|
|
|
// kTypeBeginUnprepareXID
|
2022-06-25 22:30:47 +00:00
|
|
|
// kTypeWideColumnEntity varstring varstring
|
|
|
|
// kTypeColumnFamilyWideColumnEntity varint32 varstring varstring
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
// kTypeNoop
|
2011-03-18 22:37:00 +00:00
|
|
|
// varstring :=
|
|
|
|
// len: varint32
|
|
|
|
// data: uint8[len]
|
|
|
|
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/write_batch.h"
|
2015-07-11 03:15:45 +00:00
|
|
|
|
2022-06-25 22:30:47 +00:00
|
|
|
#include <algorithm>
|
2023-11-07 00:52:51 +00:00
|
|
|
#include <cstdint>
|
2022-06-25 22:30:47 +00:00
|
|
|
#include <limits>
|
2016-07-07 21:45:29 +00:00
|
|
|
#include <map>
|
2015-07-11 03:15:45 +00:00
|
|
|
#include <stack>
|
|
|
|
#include <stdexcept>
|
2017-03-22 18:07:52 +00:00
|
|
|
#include <type_traits>
|
2019-09-12 23:53:31 +00:00
|
|
|
#include <unordered_map>
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
#include <vector>
|
2015-07-11 03:15:45 +00:00
|
|
|
|
2014-08-18 22:19:17 +00:00
|
|
|
#include "db/column_family.h"
|
2019-05-31 22:21:36 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
2015-09-02 20:58:22 +00:00
|
|
|
#include "db/dbformat.h"
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
#include "db/flush_scheduler.h"
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
#include "db/kv_checksum.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "db/memtable.h"
|
2016-06-13 23:17:26 +00:00
|
|
|
#include "db/merge_context.h"
|
2015-08-07 00:59:05 +00:00
|
|
|
#include "db/snapshot_impl.h"
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
#include "db/trim_history_scheduler.h"
|
2022-06-25 22:30:47 +00:00
|
|
|
#include "db/wide/wide_column_serialization.h"
|
2023-09-12 19:36:07 +00:00
|
|
|
#include "db/wide/wide_columns_helper.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "db/write_batch_internal.h"
|
2017-04-06 02:02:00 +00:00
|
|
|
#include "monitoring/perf_context_imp.h"
|
2023-05-17 18:27:09 +00:00
|
|
|
#include "monitoring/statistics_impl.h"
|
2020-04-20 20:21:34 +00:00
|
|
|
#include "port/lang.h"
|
2015-09-02 20:58:22 +00:00
|
|
|
#include "rocksdb/merge_operator.h"
|
2021-01-26 06:07:26 +00:00
|
|
|
#include "rocksdb/system_clock.h"
|
2024-05-28 22:33:57 +00:00
|
|
|
#include "util/aligned_storage.h"
|
refactor SavePoints (#5192)
Summary:
Savepoints are assumed to be used in a stack-wise fashion (only
the top element should be used), so they were stored by `WriteBatch`
in a member variable `save_points` using an std::stack.
Conceptually this is fine, but the implementation had a few issues:
- the `save_points_` instance variable was a plain pointer to a heap-
allocated `SavePoints` struct. The destructor of `WriteBatch` simply
deletes this pointer. However, the copy constructor of WriteBatch
just copied that pointer, meaning that copying a WriteBatch with
active savepoints will very likely have crashed before. Now a proper
copy of the savepoints is made in the copy constructor, and not just
a copy of the pointer
- `save_points_` was an std::stack, which defaults to `std::deque` for
the underlying container. A deque is a bit over the top here, as we
only need access to the most recent savepoint (i.e. stack.top()) but
never any elements at the front. std::deque is rather expensive to
initialize in common environments. For example, the STL implementation
shipped with GNU g++ will perform a heap allocation of more than 500
bytes to create an empty deque object. Although the `save_points_`
container is created lazily by RocksDB, moving from a deque to a plain
`std::vector` is much more memory-efficient. So `save_points_` is now
a vector.
- `save_points_` was changed from a plain pointer to an `std::unique_ptr`,
making ownership more explicit.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5192
Differential Revision: D15024074
Pulled By: maysamyabandeh
fbshipit-source-id: 5b128786d3789cde94e46465c9e91badd07a25d7
2019-04-20 03:30:03 +00:00
|
|
|
#include "util/autovector.h"
|
2019-08-27 17:57:28 +00:00
|
|
|
#include "util/cast_util.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "util/coding.h"
|
2018-03-06 07:48:23 +00:00
|
|
|
#include "util/duplicate_detector.h"
|
2016-11-15 23:18:56 +00:00
|
|
|
#include "util/string_util.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
// anon namespace for file-local types
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
enum ContentFlags : uint32_t {
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
DEFERRED = 1 << 0,
|
|
|
|
HAS_PUT = 1 << 1,
|
|
|
|
HAS_DELETE = 1 << 2,
|
|
|
|
HAS_SINGLE_DELETE = 1 << 3,
|
|
|
|
HAS_MERGE = 1 << 4,
|
|
|
|
HAS_BEGIN_PREPARE = 1 << 5,
|
|
|
|
HAS_END_PREPARE = 1 << 6,
|
|
|
|
HAS_COMMIT = 1 << 7,
|
|
|
|
HAS_ROLLBACK = 1 << 8,
|
2016-08-16 15:16:04 +00:00
|
|
|
HAS_DELETE_RANGE = 1 << 9,
|
2017-10-03 16:08:07 +00:00
|
|
|
HAS_BLOB_INDEX = 1 << 10,
|
2018-07-07 00:17:36 +00:00
|
|
|
HAS_BEGIN_UNPREPARE = 1 << 11,
|
2022-06-25 22:30:47 +00:00
|
|
|
HAS_PUT_ENTITY = 1 << 12,
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
HAS_TIMED_PUT = 1 << 13,
|
2015-11-06 15:03:30 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct BatchContentClassifier : public WriteBatch::Handler {
|
|
|
|
uint32_t content_flags = 0;
|
|
|
|
|
|
|
|
Status PutCF(uint32_t, const Slice&, const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_PUT;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
Status TimedPutCF(uint32_t, const Slice&, const Slice&, uint64_t) override {
|
|
|
|
content_flags |= ContentFlags::HAS_TIMED_PUT;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2022-06-25 22:30:47 +00:00
|
|
|
Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */,
|
|
|
|
const Slice& /* entity */) override {
|
|
|
|
content_flags |= ContentFlags::HAS_PUT_ENTITY;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
Status DeleteCF(uint32_t, const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_DELETE;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SingleDeleteCF(uint32_t, const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_SINGLE_DELETE;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2016-08-16 15:16:04 +00:00
|
|
|
Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_DELETE_RANGE;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
Status MergeCF(uint32_t, const Slice&, const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_MERGE;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
|
2017-10-03 16:08:07 +00:00
|
|
|
Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_BLOB_INDEX;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2018-07-07 00:17:36 +00:00
|
|
|
Status MarkBeginPrepare(bool unprepare) override {
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
|
2018-07-07 00:17:36 +00:00
|
|
|
if (unprepare) {
|
|
|
|
content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE;
|
|
|
|
}
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkEndPrepare(const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_END_PREPARE;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkCommit(const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_COMMIT;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2021-12-10 19:03:39 +00:00
|
|
|
Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_COMMIT;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
Status MarkRollback(const Slice&) override {
|
|
|
|
content_flags |= ContentFlags::HAS_ROLLBACK;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2015-11-06 15:03:30 +00:00
|
|
|
};
|
|
|
|
|
2022-11-02 21:34:24 +00:00
|
|
|
} // anonymous namespace
|
2015-11-06 15:03:30 +00:00
|
|
|
|
2015-07-11 03:15:45 +00:00
|
|
|
struct SavePoints {
|
refactor SavePoints (#5192)
Summary:
Savepoints are assumed to be used in a stack-wise fashion (only
the top element should be used), so they were stored by `WriteBatch`
in a member variable `save_points` using an std::stack.
Conceptually this is fine, but the implementation had a few issues:
- the `save_points_` instance variable was a plain pointer to a heap-
allocated `SavePoints` struct. The destructor of `WriteBatch` simply
deletes this pointer. However, the copy constructor of WriteBatch
just copied that pointer, meaning that copying a WriteBatch with
active savepoints will very likely have crashed before. Now a proper
copy of the savepoints is made in the copy constructor, and not just
a copy of the pointer
- `save_points_` was an std::stack, which defaults to `std::deque` for
the underlying container. A deque is a bit over the top here, as we
only need access to the most recent savepoint (i.e. stack.top()) but
never any elements at the front. std::deque is rather expensive to
initialize in common environments. For example, the STL implementation
shipped with GNU g++ will perform a heap allocation of more than 500
bytes to create an empty deque object. Although the `save_points_`
container is created lazily by RocksDB, moving from a deque to a plain
`std::vector` is much more memory-efficient. So `save_points_` is now
a vector.
- `save_points_` was changed from a plain pointer to an `std::unique_ptr`,
making ownership more explicit.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5192
Differential Revision: D15024074
Pulled By: maysamyabandeh
fbshipit-source-id: 5b128786d3789cde94e46465c9e91badd07a25d7
2019-04-20 03:30:03 +00:00
|
|
|
std::stack<SavePoint, autovector<SavePoint>> stack;
|
2015-07-11 03:15:45 +00:00
|
|
|
};
|
|
|
|
|
2021-09-12 22:33:15 +00:00
|
|
|
WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes,
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t protection_bytes_per_key, size_t default_cf_ts_sz)
|
|
|
|
: content_flags_(0),
|
|
|
|
max_bytes_(max_bytes),
|
|
|
|
default_cf_ts_sz_(default_cf_ts_sz),
|
|
|
|
rep_() {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
// Currently `protection_bytes_per_key` can only be enabled at 8 bytes per
|
|
|
|
// entry.
|
|
|
|
assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8);
|
|
|
|
if (protection_bytes_per_key != 0) {
|
|
|
|
prot_info_.reset(new WriteBatch::ProtectionInfo());
|
|
|
|
}
|
|
|
|
rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
|
|
|
|
? reserved_bytes
|
|
|
|
: WriteBatchInternal::kHeader);
|
|
|
|
rep_.resize(WriteBatchInternal::kHeader);
|
|
|
|
}
|
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
WriteBatch::WriteBatch(const std::string& rep)
|
2021-09-12 22:33:15 +00:00
|
|
|
: content_flags_(ContentFlags::DEFERRED), max_bytes_(0), rep_(rep) {}
|
2015-11-06 15:03:30 +00:00
|
|
|
|
2018-01-11 23:35:21 +00:00
|
|
|
WriteBatch::WriteBatch(std::string&& rep)
|
refactor SavePoints (#5192)
Summary:
Savepoints are assumed to be used in a stack-wise fashion (only
the top element should be used), so they were stored by `WriteBatch`
in a member variable `save_points` using an std::stack.
Conceptually this is fine, but the implementation had a few issues:
- the `save_points_` instance variable was a plain pointer to a heap-
allocated `SavePoints` struct. The destructor of `WriteBatch` simply
deletes this pointer. However, the copy constructor of WriteBatch
just copied that pointer, meaning that copying a WriteBatch with
active savepoints will very likely have crashed before. Now a proper
copy of the savepoints is made in the copy constructor, and not just
a copy of the pointer
- `save_points_` was an std::stack, which defaults to `std::deque` for
the underlying container. A deque is a bit over the top here, as we
only need access to the most recent savepoint (i.e. stack.top()) but
never any elements at the front. std::deque is rather expensive to
initialize in common environments. For example, the STL implementation
shipped with GNU g++ will perform a heap allocation of more than 500
bytes to create an empty deque object. Although the `save_points_`
container is created lazily by RocksDB, moving from a deque to a plain
`std::vector` is much more memory-efficient. So `save_points_` is now
a vector.
- `save_points_` was changed from a plain pointer to an `std::unique_ptr`,
making ownership more explicit.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5192
Differential Revision: D15024074
Pulled By: maysamyabandeh
fbshipit-source-id: 5b128786d3789cde94e46465c9e91badd07a25d7
2019-04-20 03:30:03 +00:00
|
|
|
: content_flags_(ContentFlags::DEFERRED),
|
2018-01-11 23:35:21 +00:00
|
|
|
max_bytes_(0),
|
2021-09-12 22:33:15 +00:00
|
|
|
rep_(std::move(rep)) {}
|
2018-01-11 23:35:21 +00:00
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
WriteBatch::WriteBatch(const WriteBatch& src)
|
refactor SavePoints (#5192)
Summary:
Savepoints are assumed to be used in a stack-wise fashion (only
the top element should be used), so they were stored by `WriteBatch`
in a member variable `save_points` using an std::stack.
Conceptually this is fine, but the implementation had a few issues:
- the `save_points_` instance variable was a plain pointer to a heap-
allocated `SavePoints` struct. The destructor of `WriteBatch` simply
deletes this pointer. However, the copy constructor of WriteBatch
just copied that pointer, meaning that copying a WriteBatch with
active savepoints will very likely have crashed before. Now a proper
copy of the savepoints is made in the copy constructor, and not just
a copy of the pointer
- `save_points_` was an std::stack, which defaults to `std::deque` for
the underlying container. A deque is a bit over the top here, as we
only need access to the most recent savepoint (i.e. stack.top()) but
never any elements at the front. std::deque is rather expensive to
initialize in common environments. For example, the STL implementation
shipped with GNU g++ will perform a heap allocation of more than 500
bytes to create an empty deque object. Although the `save_points_`
container is created lazily by RocksDB, moving from a deque to a plain
`std::vector` is much more memory-efficient. So `save_points_` is now
a vector.
- `save_points_` was changed from a plain pointer to an `std::unique_ptr`,
making ownership more explicit.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5192
Differential Revision: D15024074
Pulled By: maysamyabandeh
fbshipit-source-id: 5b128786d3789cde94e46465c9e91badd07a25d7
2019-04-20 03:30:03 +00:00
|
|
|
: wal_term_point_(src.wal_term_point_),
|
2015-11-06 15:03:30 +00:00
|
|
|
content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
|
2017-04-10 22:38:34 +00:00
|
|
|
max_bytes_(src.max_bytes_),
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
default_cf_ts_sz_(src.default_cf_ts_sz_),
|
2021-09-12 22:33:15 +00:00
|
|
|
rep_(src.rep_) {
|
refactor SavePoints (#5192)
Summary:
Savepoints are assumed to be used in a stack-wise fashion (only
the top element should be used), so they were stored by `WriteBatch`
in a member variable `save_points` using an std::stack.
Conceptually this is fine, but the implementation had a few issues:
- the `save_points_` instance variable was a plain pointer to a heap-
allocated `SavePoints` struct. The destructor of `WriteBatch` simply
deletes this pointer. However, the copy constructor of WriteBatch
just copied that pointer, meaning that copying a WriteBatch with
active savepoints will very likely have crashed before. Now a proper
copy of the savepoints is made in the copy constructor, and not just
a copy of the pointer
- `save_points_` was an std::stack, which defaults to `std::deque` for
the underlying container. A deque is a bit over the top here, as we
only need access to the most recent savepoint (i.e. stack.top()) but
never any elements at the front. std::deque is rather expensive to
initialize in common environments. For example, the STL implementation
shipped with GNU g++ will perform a heap allocation of more than 500
bytes to create an empty deque object. Although the `save_points_`
container is created lazily by RocksDB, moving from a deque to a plain
`std::vector` is much more memory-efficient. So `save_points_` is now
a vector.
- `save_points_` was changed from a plain pointer to an `std::unique_ptr`,
making ownership more explicit.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5192
Differential Revision: D15024074
Pulled By: maysamyabandeh
fbshipit-source-id: 5b128786d3789cde94e46465c9e91badd07a25d7
2019-04-20 03:30:03 +00:00
|
|
|
if (src.save_points_ != nullptr) {
|
|
|
|
save_points_.reset(new SavePoints());
|
|
|
|
save_points_->stack = src.save_points_->stack;
|
|
|
|
}
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (src.prot_info_ != nullptr) {
|
|
|
|
prot_info_.reset(new WriteBatch::ProtectionInfo());
|
|
|
|
prot_info_->entries_ = src.prot_info_->entries_;
|
|
|
|
}
|
refactor SavePoints (#5192)
Summary:
Savepoints are assumed to be used in a stack-wise fashion (only
the top element should be used), so they were stored by `WriteBatch`
in a member variable `save_points` using an std::stack.
Conceptually this is fine, but the implementation had a few issues:
- the `save_points_` instance variable was a plain pointer to a heap-
allocated `SavePoints` struct. The destructor of `WriteBatch` simply
deletes this pointer. However, the copy constructor of WriteBatch
just copied that pointer, meaning that copying a WriteBatch with
active savepoints will very likely have crashed before. Now a proper
copy of the savepoints is made in the copy constructor, and not just
a copy of the pointer
- `save_points_` was an std::stack, which defaults to `std::deque` for
the underlying container. A deque is a bit over the top here, as we
only need access to the most recent savepoint (i.e. stack.top()) but
never any elements at the front. std::deque is rather expensive to
initialize in common environments. For example, the STL implementation
shipped with GNU g++ will perform a heap allocation of more than 500
bytes to create an empty deque object. Although the `save_points_`
container is created lazily by RocksDB, moving from a deque to a plain
`std::vector` is much more memory-efficient. So `save_points_` is now
a vector.
- `save_points_` was changed from a plain pointer to an `std::unique_ptr`,
making ownership more explicit.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5192
Differential Revision: D15024074
Pulled By: maysamyabandeh
fbshipit-source-id: 5b128786d3789cde94e46465c9e91badd07a25d7
2019-04-20 03:30:03 +00:00
|
|
|
}
|
2015-11-06 15:03:30 +00:00
|
|
|
|
2017-09-18 21:36:53 +00:00
|
|
|
WriteBatch::WriteBatch(WriteBatch&& src) noexcept
|
2015-11-06 15:03:30 +00:00
|
|
|
: save_points_(std::move(src.save_points_)),
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
wal_term_point_(std::move(src.wal_term_point_)),
|
2015-11-06 15:03:30 +00:00
|
|
|
content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
|
2017-04-10 22:38:34 +00:00
|
|
|
max_bytes_(src.max_bytes_),
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
prot_info_(std::move(src.prot_info_)),
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
default_cf_ts_sz_(src.default_cf_ts_sz_),
|
2021-09-12 22:33:15 +00:00
|
|
|
rep_(std::move(src.rep_)) {}
|
2015-11-06 15:03:30 +00:00
|
|
|
|
|
|
|
WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
|
|
|
|
if (&src != this) {
|
|
|
|
this->~WriteBatch();
|
|
|
|
new (this) WriteBatch(src);
|
|
|
|
}
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatch& WriteBatch::operator=(WriteBatch&& src) {
|
|
|
|
if (&src != this) {
|
|
|
|
this->~WriteBatch();
|
|
|
|
new (this) WriteBatch(std::move(src));
|
|
|
|
}
|
|
|
|
return *this;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2023-12-04 19:17:32 +00:00
|
|
|
WriteBatch::~WriteBatch() = default;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2023-12-04 19:17:32 +00:00
|
|
|
WriteBatch::Handler::~Handler() = default;
|
2011-05-21 02:17:43 +00:00
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
void WriteBatch::Handler::LogData(const Slice& /*blob*/) {
|
2013-08-14 23:32:46 +00:00
|
|
|
// If the user has not specified something to do with blobs, then we ignore
|
|
|
|
// them.
|
|
|
|
}
|
|
|
|
|
2022-11-02 21:34:24 +00:00
|
|
|
bool WriteBatch::Handler::Continue() { return true; }
|
2013-08-22 01:27:48 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
void WriteBatch::Clear() {
|
|
|
|
rep_.clear();
|
2016-03-30 17:43:00 +00:00
|
|
|
rep_.resize(WriteBatchInternal::kHeader);
|
2015-07-11 03:15:45 +00:00
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
content_flags_.store(0, std::memory_order_relaxed);
|
|
|
|
|
2015-07-11 03:15:45 +00:00
|
|
|
if (save_points_ != nullptr) {
|
|
|
|
while (!save_points_->stack.empty()) {
|
|
|
|
save_points_->stack.pop();
|
|
|
|
}
|
|
|
|
}
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (prot_info_ != nullptr) {
|
|
|
|
prot_info_->entries_.clear();
|
|
|
|
}
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
wal_term_point_.clear();
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
default_cf_ts_sz_ = 0;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2019-09-20 19:00:55 +00:00
|
|
|
uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); }
|
2013-06-26 17:50:58 +00:00
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
uint32_t WriteBatch::ComputeContentFlags() const {
|
|
|
|
auto rv = content_flags_.load(std::memory_order_relaxed);
|
|
|
|
if ((rv & ContentFlags::DEFERRED) != 0) {
|
|
|
|
BatchContentClassifier classifier;
|
2020-10-01 22:57:28 +00:00
|
|
|
// Should we handle status here?
|
|
|
|
Iterate(&classifier).PermitUncheckedError();
|
2015-11-06 15:03:30 +00:00
|
|
|
rv = classifier.content_flags;
|
|
|
|
|
|
|
|
// this method is conceptually const, because it is performing a lazy
|
|
|
|
// computation that doesn't affect the abstract state of the batch.
|
|
|
|
// content_flags_ is marked mutable so that we can perform the
|
|
|
|
// following assignment
|
|
|
|
content_flags_.store(rv, std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
return rv;
|
|
|
|
}
|
|
|
|
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
void WriteBatch::MarkWalTerminationPoint() {
|
|
|
|
wal_term_point_.size = GetDataSize();
|
|
|
|
wal_term_point_.count = Count();
|
|
|
|
wal_term_point_.content_flags = content_flags_;
|
|
|
|
}
|
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
size_t WriteBatch::GetProtectionBytesPerKey() const {
|
|
|
|
if (prot_info_ != nullptr) {
|
|
|
|
return prot_info_->GetBytesPerKey();
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-05-27 01:15:14 +00:00
|
|
|
std::string WriteBatch::Release() {
|
|
|
|
std::string ret = std::move(rep_);
|
|
|
|
Clear();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
bool WriteBatch::HasPut() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0;
|
|
|
|
}
|
|
|
|
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
bool WriteBatch::HasTimedPut() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_TIMED_PUT) != 0;
|
|
|
|
}
|
|
|
|
|
2022-06-25 22:30:47 +00:00
|
|
|
bool WriteBatch::HasPutEntity() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_PUT_ENTITY) != 0;
|
|
|
|
}
|
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
bool WriteBatch::HasDelete() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool WriteBatch::HasSingleDelete() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0;
|
|
|
|
}
|
|
|
|
|
2016-08-16 15:16:04 +00:00
|
|
|
bool WriteBatch::HasDeleteRange() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0;
|
|
|
|
}
|
|
|
|
|
2015-11-06 15:03:30 +00:00
|
|
|
bool WriteBatch::HasMerge() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0;
|
|
|
|
}
|
|
|
|
|
2016-04-01 22:23:46 +00:00
|
|
|
bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) {
|
|
|
|
assert(input != nullptr && key != nullptr);
|
|
|
|
// Skip tag byte
|
|
|
|
input->remove_prefix(1);
|
|
|
|
|
|
|
|
if (cf_record) {
|
|
|
|
// Skip column_family bytes
|
|
|
|
uint32_t cf;
|
|
|
|
if (!GetVarint32(input, &cf)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Extract key
|
|
|
|
return GetLengthPrefixedSlice(input, key);
|
|
|
|
}
|
|
|
|
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
bool WriteBatch::HasBeginPrepare() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool WriteBatch::HasEndPrepare() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool WriteBatch::HasCommit() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool WriteBatch::HasRollback() const {
|
|
|
|
return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0;
|
|
|
|
}
|
|
|
|
|
2014-08-18 22:19:17 +00:00
|
|
|
Status ReadRecordFromWriteBatch(Slice* input, char* tag,
|
|
|
|
uint32_t* column_family, Slice* key,
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
Slice* value, Slice* blob, Slice* xid,
|
|
|
|
uint64_t* write_unix_time) {
|
2014-08-18 22:19:17 +00:00
|
|
|
assert(key != nullptr && value != nullptr);
|
|
|
|
*tag = (*input)[0];
|
|
|
|
input->remove_prefix(1);
|
|
|
|
*column_family = 0; // default
|
|
|
|
switch (*tag) {
|
|
|
|
case kTypeColumnFamilyValue:
|
|
|
|
if (!GetVarint32(input, column_family)) {
|
|
|
|
return Status::Corruption("bad WriteBatch Put");
|
|
|
|
}
|
2018-07-13 17:47:49 +00:00
|
|
|
FALLTHROUGH_INTENDED;
|
2014-08-18 22:19:17 +00:00
|
|
|
case kTypeValue:
|
|
|
|
if (!GetLengthPrefixedSlice(input, key) ||
|
|
|
|
!GetLengthPrefixedSlice(input, value)) {
|
|
|
|
return Status::Corruption("bad WriteBatch Put");
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kTypeColumnFamilyDeletion:
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
case kTypeColumnFamilySingleDeletion:
|
2014-08-18 22:19:17 +00:00
|
|
|
if (!GetVarint32(input, column_family)) {
|
|
|
|
return Status::Corruption("bad WriteBatch Delete");
|
|
|
|
}
|
2018-07-13 17:47:49 +00:00
|
|
|
FALLTHROUGH_INTENDED;
|
2014-08-18 22:19:17 +00:00
|
|
|
case kTypeDeletion:
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
case kTypeSingleDeletion:
|
2014-08-18 22:19:17 +00:00
|
|
|
if (!GetLengthPrefixedSlice(input, key)) {
|
|
|
|
return Status::Corruption("bad WriteBatch Delete");
|
|
|
|
}
|
|
|
|
break;
|
2016-08-16 15:16:04 +00:00
|
|
|
case kTypeColumnFamilyRangeDeletion:
|
|
|
|
if (!GetVarint32(input, column_family)) {
|
|
|
|
return Status::Corruption("bad WriteBatch DeleteRange");
|
|
|
|
}
|
2018-07-13 17:47:49 +00:00
|
|
|
FALLTHROUGH_INTENDED;
|
2016-08-16 15:16:04 +00:00
|
|
|
case kTypeRangeDeletion:
|
|
|
|
// for range delete, "key" is begin_key, "value" is end_key
|
|
|
|
if (!GetLengthPrefixedSlice(input, key) ||
|
|
|
|
!GetLengthPrefixedSlice(input, value)) {
|
|
|
|
return Status::Corruption("bad WriteBatch DeleteRange");
|
|
|
|
}
|
|
|
|
break;
|
2014-08-18 22:19:17 +00:00
|
|
|
case kTypeColumnFamilyMerge:
|
|
|
|
if (!GetVarint32(input, column_family)) {
|
|
|
|
return Status::Corruption("bad WriteBatch Merge");
|
|
|
|
}
|
2018-07-13 17:47:49 +00:00
|
|
|
FALLTHROUGH_INTENDED;
|
2014-08-18 22:19:17 +00:00
|
|
|
case kTypeMerge:
|
|
|
|
if (!GetLengthPrefixedSlice(input, key) ||
|
|
|
|
!GetLengthPrefixedSlice(input, value)) {
|
|
|
|
return Status::Corruption("bad WriteBatch Merge");
|
|
|
|
}
|
|
|
|
break;
|
2017-10-03 16:08:07 +00:00
|
|
|
case kTypeColumnFamilyBlobIndex:
|
|
|
|
if (!GetVarint32(input, column_family)) {
|
|
|
|
return Status::Corruption("bad WriteBatch BlobIndex");
|
|
|
|
}
|
2018-07-13 17:47:49 +00:00
|
|
|
FALLTHROUGH_INTENDED;
|
2017-10-03 16:08:07 +00:00
|
|
|
case kTypeBlobIndex:
|
|
|
|
if (!GetLengthPrefixedSlice(input, key) ||
|
|
|
|
!GetLengthPrefixedSlice(input, value)) {
|
|
|
|
return Status::Corruption("bad WriteBatch BlobIndex");
|
|
|
|
}
|
|
|
|
break;
|
2014-08-18 22:19:17 +00:00
|
|
|
case kTypeLogData:
|
|
|
|
assert(blob != nullptr);
|
|
|
|
if (!GetLengthPrefixedSlice(input, blob)) {
|
|
|
|
return Status::Corruption("bad WriteBatch Blob");
|
|
|
|
}
|
|
|
|
break;
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
case kTypeNoop:
|
|
|
|
case kTypeBeginPrepareXID:
|
2017-11-11 19:23:43 +00:00
|
|
|
// This indicates that the prepared batch is also persisted in the db.
|
|
|
|
// This is used in WritePreparedTxn
|
|
|
|
case kTypeBeginPersistedPrepareXID:
|
2018-06-29 01:46:39 +00:00
|
|
|
// This is used in WriteUnpreparedTxn
|
|
|
|
case kTypeBeginUnprepareXID:
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
break;
|
|
|
|
case kTypeEndPrepareXID:
|
|
|
|
if (!GetLengthPrefixedSlice(input, xid)) {
|
|
|
|
return Status::Corruption("bad EndPrepare XID");
|
|
|
|
}
|
|
|
|
break;
|
2021-12-10 19:03:39 +00:00
|
|
|
case kTypeCommitXIDAndTimestamp:
|
|
|
|
if (!GetLengthPrefixedSlice(input, key)) {
|
|
|
|
return Status::Corruption("bad commit timestamp");
|
|
|
|
}
|
|
|
|
FALLTHROUGH_INTENDED;
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
case kTypeCommitXID:
|
|
|
|
if (!GetLengthPrefixedSlice(input, xid)) {
|
|
|
|
return Status::Corruption("bad Commit XID");
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kTypeRollbackXID:
|
|
|
|
if (!GetLengthPrefixedSlice(input, xid)) {
|
|
|
|
return Status::Corruption("bad Rollback XID");
|
|
|
|
}
|
|
|
|
break;
|
2022-06-25 22:30:47 +00:00
|
|
|
case kTypeColumnFamilyWideColumnEntity:
|
|
|
|
if (!GetVarint32(input, column_family)) {
|
|
|
|
return Status::Corruption("bad WriteBatch PutEntity");
|
|
|
|
}
|
|
|
|
FALLTHROUGH_INTENDED;
|
|
|
|
case kTypeWideColumnEntity:
|
|
|
|
if (!GetLengthPrefixedSlice(input, key) ||
|
|
|
|
!GetLengthPrefixedSlice(input, value)) {
|
|
|
|
return Status::Corruption("bad WriteBatch PutEntity");
|
|
|
|
}
|
|
|
|
break;
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
case kTypeColumnFamilyValuePreferredSeqno:
|
|
|
|
if (!GetVarint32(input, column_family)) {
|
|
|
|
return Status::Corruption("bad WriteBatch TimedPut");
|
|
|
|
}
|
|
|
|
FALLTHROUGH_INTENDED;
|
2024-04-30 22:40:35 +00:00
|
|
|
case kTypeValuePreferredSeqno: {
|
|
|
|
Slice packed_value;
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
if (!GetLengthPrefixedSlice(input, key) ||
|
2024-04-30 22:40:35 +00:00
|
|
|
!GetLengthPrefixedSlice(input, &packed_value)) {
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
return Status::Corruption("bad WriteBatch TimedPut");
|
|
|
|
}
|
2024-04-30 22:40:35 +00:00
|
|
|
if (write_unix_time) {
|
|
|
|
std::tie(*value, *write_unix_time) =
|
|
|
|
ParsePackedValueWithWriteTime(packed_value);
|
|
|
|
} else {
|
|
|
|
// Caller doesn't want to unpack write_unix_time, so keep it packed in
|
|
|
|
// the value.
|
|
|
|
*value = packed_value;
|
|
|
|
}
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
break;
|
2024-04-30 22:40:35 +00:00
|
|
|
}
|
2014-08-18 22:19:17 +00:00
|
|
|
default:
|
2024-10-10 22:34:35 +00:00
|
|
|
return Status::Corruption(
|
|
|
|
"unknown WriteBatch tag",
|
|
|
|
std::to_string(static_cast<unsigned int>(*tag)));
|
2014-08-18 22:19:17 +00:00
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2011-05-21 02:17:43 +00:00
|
|
|
Status WriteBatch::Iterate(Handler* handler) const {
|
2019-07-31 20:36:22 +00:00
|
|
|
if (rep_.size() < WriteBatchInternal::kHeader) {
|
2011-05-21 02:17:43 +00:00
|
|
|
return Status::Corruption("malformed WriteBatch (too small)");
|
|
|
|
}
|
|
|
|
|
2019-07-31 20:36:22 +00:00
|
|
|
return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader,
|
|
|
|
rep_.size());
|
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatchInternal::Iterate(const WriteBatch* wb,
|
|
|
|
WriteBatch::Handler* handler, size_t begin,
|
|
|
|
size_t end) {
|
|
|
|
if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) {
|
|
|
|
return Status::Corruption("Invalid start/end bounds for Iterate");
|
|
|
|
}
|
|
|
|
assert(begin <= end);
|
|
|
|
Slice input(wb->rep_.data() + begin, static_cast<size_t>(end - begin));
|
|
|
|
bool whole_batch =
|
|
|
|
(begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size());
|
|
|
|
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
Slice key, value, blob, xid;
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
uint64_t write_unix_time = 0;
|
2022-06-25 22:30:47 +00:00
|
|
|
|
2017-09-28 23:43:04 +00:00
|
|
|
// Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as
|
2018-03-08 18:18:34 +00:00
|
|
|
// the batch boundary symbols otherwise we would mis-count the number of
|
2017-09-28 23:43:04 +00:00
|
|
|
// batches. We do that by checking whether the accumulated batch is empty
|
|
|
|
// before seeing the next Noop.
|
|
|
|
bool empty_batch = true;
|
2019-09-09 18:22:28 +00:00
|
|
|
uint32_t found = 0;
|
2014-02-26 01:30:54 +00:00
|
|
|
Status s;
|
2018-02-06 02:32:54 +00:00
|
|
|
char tag = 0;
|
|
|
|
uint32_t column_family = 0; // default
|
2018-04-20 22:13:47 +00:00
|
|
|
bool last_was_try_again = false;
|
2018-10-11 03:55:42 +00:00
|
|
|
bool handler_continue = true;
|
|
|
|
while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) {
|
|
|
|
handler_continue = handler->Continue();
|
|
|
|
if (!handler_continue) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-02-16 16:36:47 +00:00
|
|
|
if (LIKELY(!s.IsTryAgain())) {
|
2018-04-20 22:13:47 +00:00
|
|
|
last_was_try_again = false;
|
2018-02-06 02:32:54 +00:00
|
|
|
tag = 0;
|
|
|
|
column_family = 0; // default
|
|
|
|
|
|
|
|
s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
&blob, &xid, &write_unix_time);
|
2018-02-06 02:32:54 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(s.IsTryAgain());
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(!last_was_try_again); // to detect infinite loop bugs
|
2018-04-20 22:13:47 +00:00
|
|
|
if (UNLIKELY(last_was_try_again)) {
|
|
|
|
return Status::Corruption(
|
|
|
|
"two consecutive TryAgain in WriteBatch handler; this is either a "
|
|
|
|
"software bug or data corruption.");
|
|
|
|
}
|
|
|
|
last_was_try_again = true;
|
2018-02-06 02:32:54 +00:00
|
|
|
s = Status::OK();
|
2014-08-18 22:19:17 +00:00
|
|
|
}
|
|
|
|
|
2011-05-21 02:17:43 +00:00
|
|
|
switch (tag) {
|
2014-01-07 22:41:42 +00:00
|
|
|
case kTypeColumnFamilyValue:
|
2011-05-21 02:17:43 +00:00
|
|
|
case kTypeValue:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
2015-11-06 15:03:30 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_PUT));
|
2014-08-18 22:19:17 +00:00
|
|
|
s = handler->PutCF(column_family, key, value);
|
2018-02-16 16:36:47 +00:00
|
|
|
if (LIKELY(s.ok())) {
|
2018-02-06 02:32:54 +00:00
|
|
|
empty_batch = false;
|
|
|
|
found++;
|
|
|
|
}
|
2011-05-21 02:17:43 +00:00
|
|
|
break;
|
2014-01-07 22:41:42 +00:00
|
|
|
case kTypeColumnFamilyDeletion:
|
2011-05-21 02:17:43 +00:00
|
|
|
case kTypeDeletion:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
2015-11-06 15:03:30 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_DELETE));
|
2014-08-18 22:19:17 +00:00
|
|
|
s = handler->DeleteCF(column_family, key);
|
2018-02-16 16:36:47 +00:00
|
|
|
if (LIKELY(s.ok())) {
|
2018-02-06 02:32:54 +00:00
|
|
|
empty_batch = false;
|
|
|
|
found++;
|
|
|
|
}
|
2011-05-21 02:17:43 +00:00
|
|
|
break;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
case kTypeColumnFamilySingleDeletion:
|
|
|
|
case kTypeSingleDeletion:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
2015-11-06 15:03:30 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE));
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
s = handler->SingleDeleteCF(column_family, key);
|
2018-02-16 16:36:47 +00:00
|
|
|
if (LIKELY(s.ok())) {
|
2018-02-06 02:32:54 +00:00
|
|
|
empty_batch = false;
|
|
|
|
found++;
|
|
|
|
}
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
break;
|
2016-08-16 15:16:04 +00:00
|
|
|
case kTypeColumnFamilyRangeDeletion:
|
|
|
|
case kTypeRangeDeletion:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
2016-08-16 15:16:04 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE));
|
|
|
|
s = handler->DeleteRangeCF(column_family, key, value);
|
2018-02-16 16:36:47 +00:00
|
|
|
if (LIKELY(s.ok())) {
|
2018-02-06 02:32:54 +00:00
|
|
|
empty_batch = false;
|
|
|
|
found++;
|
|
|
|
}
|
2016-08-16 15:16:04 +00:00
|
|
|
break;
|
2014-01-07 22:41:42 +00:00
|
|
|
case kTypeColumnFamilyMerge:
|
2013-03-21 22:59:47 +00:00
|
|
|
case kTypeMerge:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
2015-11-06 15:03:30 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_MERGE));
|
2014-08-18 22:19:17 +00:00
|
|
|
s = handler->MergeCF(column_family, key, value);
|
2018-02-16 16:36:47 +00:00
|
|
|
if (LIKELY(s.ok())) {
|
2018-02-06 02:32:54 +00:00
|
|
|
empty_batch = false;
|
|
|
|
found++;
|
|
|
|
}
|
2013-03-21 22:59:47 +00:00
|
|
|
break;
|
2017-10-03 16:08:07 +00:00
|
|
|
case kTypeColumnFamilyBlobIndex:
|
|
|
|
case kTypeBlobIndex:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
2017-10-03 16:08:07 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
|
|
|
|
s = handler->PutBlobIndexCF(column_family, key, value);
|
2018-02-16 16:36:47 +00:00
|
|
|
if (LIKELY(s.ok())) {
|
2018-02-06 02:32:54 +00:00
|
|
|
found++;
|
|
|
|
}
|
2017-10-03 16:08:07 +00:00
|
|
|
break;
|
2013-08-14 23:32:46 +00:00
|
|
|
case kTypeLogData:
|
2014-08-18 22:19:17 +00:00
|
|
|
handler->LogData(blob);
|
2017-12-01 07:39:56 +00:00
|
|
|
// A batch might have nothing but LogData. It is still a batch.
|
|
|
|
empty_batch = false;
|
2013-08-14 23:32:46 +00:00
|
|
|
break;
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
case kTypeBeginPrepareXID:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
|
2020-10-20 20:17:17 +00:00
|
|
|
s = handler->MarkBeginPrepare();
|
|
|
|
assert(s.ok());
|
2017-09-28 23:43:04 +00:00
|
|
|
empty_batch = false;
|
2022-04-28 21:42:00 +00:00
|
|
|
if (handler->WriteAfterCommit() ==
|
|
|
|
WriteBatch::Handler::OptionState::kDisabled) {
|
2017-11-11 19:23:43 +00:00
|
|
|
s = Status::NotSupported(
|
|
|
|
"WriteCommitted txn tag when write_after_commit_ is disabled (in "
|
2018-06-29 01:46:39 +00:00
|
|
|
"WritePrepared/WriteUnprepared mode). If it is not due to "
|
|
|
|
"corruption, the WAL must be emptied before changing the "
|
|
|
|
"WritePolicy.");
|
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (handler->WriteBeforePrepare() ==
|
|
|
|
WriteBatch::Handler::OptionState::kEnabled) {
|
2018-06-29 01:46:39 +00:00
|
|
|
s = Status::NotSupported(
|
|
|
|
"WriteCommitted txn tag when write_before_prepare_ is enabled "
|
|
|
|
"(in WriteUnprepared mode). If it is not due to corruption, the "
|
|
|
|
"WAL must be emptied before changing the WritePolicy.");
|
2017-11-11 19:23:43 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kTypeBeginPersistedPrepareXID:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
2017-11-11 19:23:43 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
|
2020-10-20 20:17:17 +00:00
|
|
|
s = handler->MarkBeginPrepare();
|
|
|
|
assert(s.ok());
|
2017-11-11 19:23:43 +00:00
|
|
|
empty_batch = false;
|
2022-04-28 21:42:00 +00:00
|
|
|
if (handler->WriteAfterCommit() ==
|
|
|
|
WriteBatch::Handler::OptionState::kEnabled) {
|
2017-11-11 19:23:43 +00:00
|
|
|
s = Status::NotSupported(
|
2018-06-29 01:46:39 +00:00
|
|
|
"WritePrepared/WriteUnprepared txn tag when write_after_commit_ "
|
|
|
|
"is enabled (in default WriteCommitted mode). If it is not due "
|
|
|
|
"to corruption, the WAL must be emptied before changing the "
|
|
|
|
"WritePolicy.");
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kTypeBeginUnprepareXID:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
2018-07-07 00:17:36 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE));
|
2020-10-20 20:17:17 +00:00
|
|
|
s = handler->MarkBeginPrepare(true /* unprepared */);
|
|
|
|
assert(s.ok());
|
2018-06-29 01:46:39 +00:00
|
|
|
empty_batch = false;
|
2022-04-28 21:42:00 +00:00
|
|
|
if (handler->WriteAfterCommit() ==
|
|
|
|
WriteBatch::Handler::OptionState::kEnabled) {
|
2018-06-29 01:46:39 +00:00
|
|
|
s = Status::NotSupported(
|
|
|
|
"WriteUnprepared txn tag when write_after_commit_ is enabled (in "
|
2017-11-11 19:23:43 +00:00
|
|
|
"default WriteCommitted mode). If it is not due to corruption, "
|
|
|
|
"the WAL must be emptied before changing the WritePolicy.");
|
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (handler->WriteBeforePrepare() ==
|
|
|
|
WriteBatch::Handler::OptionState::kDisabled) {
|
2018-06-29 01:46:39 +00:00
|
|
|
s = Status::NotSupported(
|
|
|
|
"WriteUnprepared txn tag when write_before_prepare_ is disabled "
|
|
|
|
"(in WriteCommitted/WritePrepared mode). If it is not due to "
|
|
|
|
"corruption, the WAL must be emptied before changing the "
|
|
|
|
"WritePolicy.");
|
|
|
|
}
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
break;
|
|
|
|
case kTypeEndPrepareXID:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE));
|
2020-10-20 20:17:17 +00:00
|
|
|
s = handler->MarkEndPrepare(xid);
|
|
|
|
assert(s.ok());
|
2017-09-28 23:43:04 +00:00
|
|
|
empty_batch = true;
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
break;
|
|
|
|
case kTypeCommitXID:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
|
2020-10-20 20:17:17 +00:00
|
|
|
s = handler->MarkCommit(xid);
|
|
|
|
assert(s.ok());
|
2017-09-28 23:43:04 +00:00
|
|
|
empty_batch = true;
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
break;
|
2021-12-10 19:03:39 +00:00
|
|
|
case kTypeCommitXIDAndTimestamp:
|
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
|
|
|
|
// key stores the commit timestamp.
|
|
|
|
assert(!key.empty());
|
|
|
|
s = handler->MarkCommitWithTimestamp(xid, key);
|
|
|
|
if (LIKELY(s.ok())) {
|
|
|
|
empty_batch = true;
|
|
|
|
}
|
|
|
|
break;
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
case kTypeRollbackXID:
|
2019-07-31 20:36:22 +00:00
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK));
|
2020-10-20 20:17:17 +00:00
|
|
|
s = handler->MarkRollback(xid);
|
|
|
|
assert(s.ok());
|
2017-09-28 23:43:04 +00:00
|
|
|
empty_batch = true;
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
break;
|
|
|
|
case kTypeNoop:
|
2020-10-20 20:17:17 +00:00
|
|
|
s = handler->MarkNoop(empty_batch);
|
|
|
|
assert(s.ok());
|
2017-09-28 23:43:04 +00:00
|
|
|
empty_batch = true;
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
break;
|
2022-06-25 22:30:47 +00:00
|
|
|
case kTypeWideColumnEntity:
|
|
|
|
case kTypeColumnFamilyWideColumnEntity:
|
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_PUT_ENTITY));
|
|
|
|
s = handler->PutEntityCF(column_family, key, value);
|
|
|
|
if (LIKELY(s.ok())) {
|
|
|
|
empty_batch = false;
|
|
|
|
++found;
|
|
|
|
}
|
|
|
|
break;
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
case kTypeValuePreferredSeqno:
|
|
|
|
case kTypeColumnFamilyValuePreferredSeqno:
|
|
|
|
assert(wb->content_flags_.load(std::memory_order_relaxed) &
|
|
|
|
(ContentFlags::DEFERRED | ContentFlags::HAS_TIMED_PUT));
|
|
|
|
s = handler->TimedPutCF(column_family, key, value, write_unix_time);
|
|
|
|
if (LIKELY(s.ok())) {
|
|
|
|
empty_batch = false;
|
|
|
|
++found;
|
|
|
|
}
|
|
|
|
break;
|
2011-05-21 02:17:43 +00:00
|
|
|
default:
|
2024-10-10 22:34:35 +00:00
|
|
|
return Status::Corruption(
|
|
|
|
"unknown WriteBatch tag",
|
|
|
|
std::to_string(static_cast<unsigned int>(tag)));
|
2011-05-21 02:17:43 +00:00
|
|
|
}
|
|
|
|
}
|
2014-02-26 01:30:54 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2019-07-31 20:36:22 +00:00
|
|
|
if (handler_continue && whole_batch &&
|
|
|
|
found != WriteBatchInternal::Count(wb)) {
|
2011-05-21 02:17:43 +00:00
|
|
|
return Status::Corruption("WriteBatch has wrong count");
|
|
|
|
} else {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-02 00:23:52 +00:00
|
|
|
bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) {
|
|
|
|
return b->is_latest_persistent_state_;
|
|
|
|
}
|
|
|
|
|
2021-07-30 19:06:47 +00:00
|
|
|
void WriteBatchInternal::SetAsLatestPersistentState(WriteBatch* b) {
|
2017-11-02 00:23:52 +00:00
|
|
|
b->is_latest_persistent_state_ = true;
|
|
|
|
}
|
|
|
|
|
2019-09-09 18:22:28 +00:00
|
|
|
uint32_t WriteBatchInternal::Count(const WriteBatch* b) {
|
2011-03-18 22:37:00 +00:00
|
|
|
return DecodeFixed32(b->rep_.data() + 8);
|
|
|
|
}
|
|
|
|
|
2019-09-09 18:22:28 +00:00
|
|
|
void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) {
|
2011-03-18 22:37:00 +00:00
|
|
|
EncodeFixed32(&b->rep_[8], n);
|
|
|
|
}
|
|
|
|
|
|
|
|
SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
|
|
|
|
return SequenceNumber(DecodeFixed64(b->rep_.data()));
|
|
|
|
}
|
|
|
|
|
|
|
|
void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
|
2023-12-04 19:17:32 +00:00
|
|
|
EncodeFixed64(b->rep_.data(), seq);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) {
|
2016-04-01 22:23:46 +00:00
|
|
|
return WriteBatchInternal::kHeader;
|
2016-03-31 02:56:55 +00:00
|
|
|
}
|
2015-07-11 03:15:45 +00:00
|
|
|
|
2023-08-09 20:49:42 +00:00
|
|
|
void WriteBatchInternal::SetDefaultColumnFamilyTimestampSize(
|
|
|
|
WriteBatch* wb, size_t default_cf_ts_sz) {
|
|
|
|
wb->default_cf_ts_sz_ = default_cf_ts_sz;
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
std::tuple<Status, uint32_t, size_t>
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(
|
|
|
|
WriteBatch* b, ColumnFamilyHandle* column_family) {
|
|
|
|
uint32_t cf_id = GetColumnFamilyID(column_family);
|
|
|
|
size_t ts_sz = 0;
|
|
|
|
Status s;
|
|
|
|
if (column_family) {
|
|
|
|
const Comparator* const ucmp = column_family->GetComparator();
|
|
|
|
if (ucmp) {
|
|
|
|
ts_sz = ucmp->timestamp_size();
|
|
|
|
if (0 == cf_id && b->default_cf_ts_sz_ != ts_sz) {
|
|
|
|
s = Status::InvalidArgument("Default cf timestamp size mismatch");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (b->default_cf_ts_sz_ > 0) {
|
|
|
|
ts_sz = b->default_cf_ts_sz_;
|
|
|
|
}
|
|
|
|
return std::make_tuple(s, cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& ts) {
|
|
|
|
if (!column_family) {
|
|
|
|
return Status::InvalidArgument("column family handle cannot be null");
|
|
|
|
}
|
|
|
|
const Comparator* const ucmp = column_family->GetComparator();
|
|
|
|
assert(ucmp);
|
|
|
|
size_t cf_ts_sz = ucmp->timestamp_size();
|
|
|
|
if (0 == cf_ts_sz) {
|
|
|
|
return Status::InvalidArgument("timestamp disabled");
|
|
|
|
}
|
|
|
|
if (cf_ts_sz != ts.size()) {
|
|
|
|
return Status::InvalidArgument("timestamp size mismatch");
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2022-11-02 21:34:24 +00:00
|
|
|
} // anonymous namespace
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const Slice& key, const Slice& value) {
|
2022-05-05 20:08:21 +00:00
|
|
|
if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
2018-02-09 22:50:09 +00:00
|
|
|
return Status::InvalidArgument("key is too large");
|
|
|
|
}
|
2022-05-05 20:08:21 +00:00
|
|
|
if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
2018-02-09 22:50:09 +00:00
|
|
|
return Status::InvalidArgument("value is too large");
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
LocalSavePoint save(b);
|
2014-04-22 18:27:33 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeValue));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
2021-09-12 22:33:15 +00:00
|
|
|
PutLengthPrefixedSlice(&b->rep_, key);
|
2014-04-22 18:27:33 +00:00
|
|
|
PutLengthPrefixedSlice(&b->rep_, value);
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(
|
|
|
|
b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// Technically the optype could've been `kTypeColumnFamilyValue` with the
|
|
|
|
// CF ID encoded in the `WriteBatch`. That distinction is unimportant
|
|
|
|
// however since we verify CF ID is correct, as well as all other fields
|
|
|
|
// (a missing/extra encoded CF ID would corrupt another field). It is
|
|
|
|
// convenient to consolidate on `kTypeValue` here as that is what will be
|
|
|
|
// inserted into memtable.
|
2021-09-14 20:13:36 +00:00
|
|
|
b->prot_info_->entries_.emplace_back(ProtectionInfo64()
|
|
|
|
.ProtectKVO(key, value, kTypeValue)
|
|
|
|
.ProtectC(column_family_id));
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2014-04-22 18:27:33 +00:00
|
|
|
}
|
|
|
|
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
Status WriteBatchInternal::TimedPut(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const Slice& key, const Slice& value,
|
|
|
|
uint64_t write_unix_time) {
|
|
|
|
if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
|
|
|
return Status::InvalidArgument("key is too large");
|
|
|
|
}
|
|
|
|
if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
|
|
|
return Status::InvalidArgument("value is too large");
|
|
|
|
}
|
2024-03-21 17:00:15 +00:00
|
|
|
if (std::numeric_limits<uint64_t>::max() == write_unix_time) {
|
|
|
|
return WriteBatchInternal::Put(b, column_family_id, key, value);
|
|
|
|
}
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
LocalSavePoint save(b);
|
|
|
|
|
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeValuePreferredSeqno));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValuePreferredSeqno));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
2024-04-30 22:40:35 +00:00
|
|
|
std::string value_buf;
|
|
|
|
Slice packed_value =
|
|
|
|
PackValueAndWriteTime(value, write_unix_time, &value_buf);
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
PutLengthPrefixedSlice(&b->rep_, key);
|
2024-04-30 22:40:35 +00:00
|
|
|
PutLengthPrefixedSlice(&b->rep_, packed_value);
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_TIMED_PUT,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in other internal functions for why we don't need to
|
|
|
|
// differentiate between `kTypeValuePreferredSeqno` and
|
|
|
|
// `kTypeColumnFamilyValuePreferredSeqno` here.
|
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
2024-04-30 22:40:35 +00:00
|
|
|
.ProtectKVO(key, packed_value, kTypeValuePreferredSeqno)
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
.ProtectC(column_family_id));
|
|
|
|
}
|
|
|
|
return save.commit();
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Put(this, cf_id, key, value);
|
|
|
|
} else {
|
|
|
|
needs_in_place_update_ts_ = true;
|
|
|
|
has_key_with_ts_ = true;
|
|
|
|
std::string dummy_ts(ts_sz, '\0');
|
|
|
|
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
|
|
|
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
|
|
|
SliceParts(&value, 1));
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
2024-08-05 20:06:45 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
|
|
|
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value, uint64_t write_unix_time) {
|
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
} else if (ts_sz != 0) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"TimedPut is not supported in combination with user-defined "
|
|
|
|
"timestamps.");
|
|
|
|
}
|
|
|
|
return WriteBatchInternal::TimedPut(this, cf_id, key, value, write_unix_time);
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& ts, const Slice& value) {
|
2024-08-05 20:06:45 +00:00
|
|
|
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2022-02-22 22:19:02 +00:00
|
|
|
has_key_with_ts_ = true;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
assert(column_family);
|
|
|
|
uint32_t cf_id = column_family->GetID();
|
|
|
|
std::array<Slice, 2> key_with_ts{{key, ts}};
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
|
|
|
SliceParts(&value, 1));
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts.size());
|
|
|
|
}
|
|
|
|
return s;
|
2014-04-22 18:27:33 +00:00
|
|
|
}
|
|
|
|
|
2018-02-09 22:50:09 +00:00
|
|
|
Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
|
|
|
|
const SliceParts& value) {
|
|
|
|
size_t total_key_bytes = 0;
|
|
|
|
for (int i = 0; i < key.num_parts; ++i) {
|
|
|
|
total_key_bytes += key.parts[i].size();
|
|
|
|
}
|
2022-05-05 20:08:21 +00:00
|
|
|
if (total_key_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) {
|
2018-02-09 22:50:09 +00:00
|
|
|
return Status::InvalidArgument("key is too large");
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t total_value_bytes = 0;
|
|
|
|
for (int i = 0; i < value.num_parts; ++i) {
|
|
|
|
total_value_bytes += value.parts[i].size();
|
|
|
|
}
|
2022-05-05 20:08:21 +00:00
|
|
|
if (total_value_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) {
|
2018-02-09 22:50:09 +00:00
|
|
|
return Status::InvalidArgument("value is too large");
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const SliceParts& key, const SliceParts& value) {
|
2018-02-09 22:50:09 +00:00
|
|
|
Status s = CheckSlicePartsLength(key, value);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
LocalSavePoint save(b);
|
2014-04-22 18:27:33 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
2014-01-07 22:41:42 +00:00
|
|
|
if (column_family_id == 0) {
|
2014-04-22 18:27:33 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeValue));
|
2014-01-07 22:41:42 +00:00
|
|
|
} else {
|
2014-04-22 18:27:33 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
2014-01-07 22:41:42 +00:00
|
|
|
}
|
2021-09-12 22:33:15 +00:00
|
|
|
PutLengthPrefixedSliceParts(&b->rep_, key);
|
2014-04-22 18:27:33 +00:00
|
|
|
PutLengthPrefixedSliceParts(&b->rep_, value);
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(
|
|
|
|
b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
|
|
|
b->prot_info_->entries_.emplace_back(ProtectionInfo64()
|
|
|
|
.ProtectKVO(key, value, kTypeValue)
|
|
|
|
.ProtectC(column_family_id));
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
|
|
|
|
const SliceParts& value) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ts_sz == 0) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Put(this, cf_id, key, value);
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call this method on column family enabling timestamp");
|
2014-04-22 18:27:33 +00:00
|
|
|
}
|
2014-03-14 18:26:13 +00:00
|
|
|
|
2022-06-25 22:30:47 +00:00
|
|
|
Status WriteBatchInternal::PutEntity(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const Slice& key,
|
|
|
|
const WideColumns& columns) {
|
|
|
|
assert(b);
|
|
|
|
|
|
|
|
if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
|
|
|
return Status::InvalidArgument("key is too large");
|
|
|
|
}
|
|
|
|
|
|
|
|
WideColumns sorted_columns(columns);
|
2023-09-12 19:36:07 +00:00
|
|
|
WideColumnsHelper::SortColumns(sorted_columns);
|
2022-06-25 22:30:47 +00:00
|
|
|
|
|
|
|
std::string entity;
|
|
|
|
const Status s = WideColumnSerialization::Serialize(sorted_columns, entity);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (entity.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
|
|
|
return Status::InvalidArgument("wide column entity is too large");
|
|
|
|
}
|
|
|
|
|
|
|
|
LocalSavePoint save(b);
|
|
|
|
|
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeWideColumnEntity));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyWideColumnEntity));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, key);
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, entity);
|
|
|
|
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_PUT_ENTITY,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
|
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
|
|
|
.ProtectKVO(key, entity, kTypeWideColumnEntity)
|
|
|
|
.ProtectC(column_family_id));
|
|
|
|
}
|
|
|
|
|
|
|
|
return save.commit();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatch::PutEntity(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const WideColumns& columns) {
|
|
|
|
if (!column_family) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call this method without a column family handle");
|
|
|
|
}
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
size_t ts_sz = 0;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ts_sz) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call this method on column family enabling timestamp");
|
|
|
|
}
|
|
|
|
|
|
|
|
return WriteBatchInternal::PutEntity(this, cf_id, key, columns);
|
|
|
|
}
|
|
|
|
|
2023-11-07 00:52:51 +00:00
|
|
|
Status WriteBatch::PutEntity(const Slice& key,
|
|
|
|
const AttributeGroups& attribute_groups) {
|
|
|
|
if (attribute_groups.empty()) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call this method with empty attribute groups");
|
|
|
|
}
|
|
|
|
Status s;
|
|
|
|
for (const AttributeGroup& ag : attribute_groups) {
|
|
|
|
s = PutEntity(ag.column_family(), key, ag.columns());
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::InsertNoop(WriteBatch* b) {
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeNoop));
|
2017-04-10 22:38:34 +00:00
|
|
|
return Status::OK();
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
}
|
|
|
|
|
2024-11-07 01:32:03 +00:00
|
|
|
ValueType WriteBatchInternal::GetBeginPrepareType(bool write_after_commit,
|
|
|
|
bool unprepared_batch) {
|
|
|
|
return write_after_commit
|
|
|
|
? kTypeBeginPrepareXID
|
|
|
|
: (unprepared_batch ? kTypeBeginUnprepareXID
|
|
|
|
: kTypeBeginPersistedPrepareXID);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatchInternal::InsertBeginPrepare(WriteBatch* b,
|
|
|
|
bool write_after_commit,
|
|
|
|
bool unprepared_batch) {
|
|
|
|
b->rep_.push_back(static_cast<char>(
|
|
|
|
GetBeginPrepareType(write_after_commit, unprepared_batch)));
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_BEGIN_PREPARE,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatchInternal::InsertEndPrepare(WriteBatch* b, const Slice& xid) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, xid);
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_END_PREPARE,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2017-11-11 19:23:43 +00:00
|
|
|
Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid,
|
2018-06-29 01:46:39 +00:00
|
|
|
bool write_after_commit,
|
|
|
|
bool unprepared_batch) {
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
// a manually constructed batch can only contain one prepare section
|
|
|
|
assert(b->rep_[12] == static_cast<char>(kTypeNoop));
|
|
|
|
|
|
|
|
// all savepoints up to this point are cleared
|
|
|
|
if (b->save_points_ != nullptr) {
|
|
|
|
while (!b->save_points_->stack.empty()) {
|
|
|
|
b->save_points_->stack.pop();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// rewrite noop as begin marker
|
2018-06-29 01:46:39 +00:00
|
|
|
b->rep_[12] = static_cast<char>(
|
2024-11-07 01:32:03 +00:00
|
|
|
GetBeginPrepareType(write_after_commit, unprepared_batch));
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_BEGIN_PREPARE,
|
|
|
|
std::memory_order_relaxed);
|
2018-07-24 07:09:18 +00:00
|
|
|
if (unprepared_batch) {
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_BEGIN_UNPREPARE,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
}
|
2024-11-07 01:32:03 +00:00
|
|
|
return WriteBatchInternal::InsertEndPrepare(b, xid);
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) {
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeCommitXID));
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, xid);
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_COMMIT,
|
|
|
|
std::memory_order_relaxed);
|
2017-04-10 22:38:34 +00:00
|
|
|
return Status::OK();
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
}
|
|
|
|
|
2021-12-10 19:03:39 +00:00
|
|
|
Status WriteBatchInternal::MarkCommitWithTimestamp(WriteBatch* b,
|
|
|
|
const Slice& xid,
|
|
|
|
const Slice& commit_ts) {
|
|
|
|
assert(!commit_ts.empty());
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeCommitXIDAndTimestamp));
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, commit_ts);
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, xid);
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_COMMIT,
|
|
|
|
std::memory_order_relaxed);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) {
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeRollbackXID));
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, xid);
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_ROLLBACK,
|
|
|
|
std::memory_order_relaxed);
|
2017-04-10 22:38:34 +00:00
|
|
|
return Status::OK();
|
Modification of WriteBatch to support two phase commit
Summary: Adds three new WriteBatch data types: Prepare(xid), Commit(xid), Rollback(xid). Prepare(xid) should precede the (single) operation to which is applies. There can obviously be multiple Prepare(xid) markers. There should only be one Rollback(xid) or Commit(xid) marker yet not both. None of this logic is currently enforced and will most likely be implemented further up such as in the memtableinserter. All three markers are similar to PutLogData in that they are writebatch meta-data, ie stored but not counted. All three markers differ from PutLogData in that they will actually be written to disk. As for WriteBatchWithIndex, Prepare, Commit, Rollback are all implemented just as PutLogData and none are tested just as PutLogData.
Test Plan: single unit test in write_batch_test.
Reviewers: hermanlee4, sdong, anthony
Subscribers: leveldb, dhruba, vasilep, andrewkr
Differential Revision: https://reviews.facebook.net/D57867
2016-04-08 06:35:51 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const Slice& key) {
|
|
|
|
LocalSavePoint save(b);
|
2014-04-22 18:27:33 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
2014-01-07 22:41:42 +00:00
|
|
|
if (column_family_id == 0) {
|
2014-04-22 18:27:33 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeDeletion));
|
2014-01-07 22:41:42 +00:00
|
|
|
} else {
|
2014-04-22 18:27:33 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
2014-01-07 22:41:42 +00:00
|
|
|
}
|
2021-09-12 22:33:15 +00:00
|
|
|
PutLengthPrefixedSlice(&b->rep_, key);
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_DELETE,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
2021-09-14 20:13:36 +00:00
|
|
|
.ProtectKVO(key, "" /* value */, kTypeDeletion)
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
.ProtectC(column_family_id));
|
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2013-11-07 20:37:58 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Delete(this, cf_id, key);
|
|
|
|
} else {
|
|
|
|
needs_in_place_update_ts_ = true;
|
|
|
|
has_key_with_ts_ = true;
|
|
|
|
std::string dummy_ts(ts_sz, '\0');
|
|
|
|
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
|
|
|
s = WriteBatchInternal::Delete(this, cf_id,
|
|
|
|
SliceParts(key_with_ts.data(), 2));
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
2024-08-05 20:06:45 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& ts) {
|
2024-08-05 20:06:45 +00:00
|
|
|
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
assert(column_family);
|
2022-02-22 22:19:02 +00:00
|
|
|
has_key_with_ts_ = true;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
uint32_t cf_id = column_family->GetID();
|
|
|
|
std::array<Slice, 2> key_with_ts{{key, ts}};
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Delete(this, cf_id,
|
|
|
|
SliceParts(key_with_ts.data(), 2));
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts.size());
|
|
|
|
}
|
|
|
|
return s;
|
2014-07-10 16:31:42 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const SliceParts& key) {
|
|
|
|
LocalSavePoint save(b);
|
2014-07-10 16:31:42 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeDeletion));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
2021-09-12 22:33:15 +00:00
|
|
|
PutLengthPrefixedSliceParts(&b->rep_, key);
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_DELETE,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
2021-09-14 20:13:36 +00:00
|
|
|
.ProtectKVO(key,
|
|
|
|
SliceParts(nullptr /* _parts */, 0 /* _num_parts */),
|
|
|
|
kTypeDeletion)
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
.ProtectC(column_family_id));
|
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2014-07-10 16:31:42 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
|
|
|
|
const SliceParts& key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Delete(this, cf_id, key);
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call this method on column family enabling timestamp");
|
2014-04-22 18:27:33 +00:00
|
|
|
}
|
2014-03-14 18:26:13 +00:00
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::SingleDelete(WriteBatch* b,
|
|
|
|
uint32_t column_family_id,
|
|
|
|
const Slice& key) {
|
|
|
|
LocalSavePoint save(b);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, key);
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_SINGLE_DELETE,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
2021-09-12 22:33:15 +00:00
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
2021-09-14 20:13:36 +00:00
|
|
|
.ProtectKVO(key, "" /* value */, kTypeSingleDeletion)
|
2021-09-12 22:33:15 +00:00
|
|
|
.ProtectC(column_family_id));
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::SingleDelete(this, cf_id, key);
|
|
|
|
} else {
|
|
|
|
needs_in_place_update_ts_ = true;
|
|
|
|
has_key_with_ts_ = true;
|
|
|
|
std::string dummy_ts(ts_sz, '\0');
|
|
|
|
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
|
|
|
s = WriteBatchInternal::SingleDelete(this, cf_id,
|
|
|
|
SliceParts(key_with_ts.data(), 2));
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
2024-08-05 20:06:45 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& ts) {
|
2024-08-05 20:06:45 +00:00
|
|
|
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2022-02-22 22:19:02 +00:00
|
|
|
has_key_with_ts_ = true;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
assert(column_family);
|
|
|
|
uint32_t cf_id = column_family->GetID();
|
|
|
|
std::array<Slice, 2> key_with_ts{{key, ts}};
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::SingleDelete(this, cf_id,
|
|
|
|
SliceParts(key_with_ts.data(), 2));
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts.size());
|
|
|
|
}
|
|
|
|
return s;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::SingleDelete(WriteBatch* b,
|
|
|
|
uint32_t column_family_id,
|
|
|
|
const SliceParts& key) {
|
|
|
|
LocalSavePoint save(b);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
|
|
|
PutLengthPrefixedSliceParts(&b->rep_, key);
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_SINGLE_DELETE,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
2021-09-14 20:13:36 +00:00
|
|
|
.ProtectKVO(key,
|
|
|
|
SliceParts(nullptr /* _parts */,
|
|
|
|
0 /* _num_parts */) /* value */,
|
|
|
|
kTypeSingleDeletion)
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
.ProtectC(column_family_id));
|
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|
|
|
const SliceParts& key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::SingleDelete(this, cf_id, key);
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call this method on column family enabling timestamp");
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const Slice& begin_key,
|
|
|
|
const Slice& end_key) {
|
|
|
|
LocalSavePoint save(b);
|
2016-08-16 15:16:04 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, begin_key);
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, end_key);
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_DELETE_RANGE,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
// In `DeleteRange()`, the end key is treated as the value.
|
2021-09-12 22:33:15 +00:00
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
2021-09-14 20:13:36 +00:00
|
|
|
.ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
|
2021-09-12 22:33:15 +00:00
|
|
|
.ProtectC(column_family_id));
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2016-08-16 15:16:04 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& begin_key, const Slice& end_key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
|
|
|
} else {
|
|
|
|
needs_in_place_update_ts_ = true;
|
|
|
|
has_key_with_ts_ = true;
|
|
|
|
std::string dummy_ts(ts_sz, '\0');
|
|
|
|
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
|
|
|
|
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
|
|
|
|
s = WriteBatchInternal::DeleteRange(this, cf_id,
|
|
|
|
SliceParts(begin_key_with_ts.data(), 2),
|
|
|
|
SliceParts(end_key_with_ts.data(), 2));
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
2024-08-05 20:06:45 +00:00
|
|
|
return s;
|
2022-09-30 23:13:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& begin_key, const Slice& end_key,
|
|
|
|
const Slice& ts) {
|
2024-08-05 20:06:45 +00:00
|
|
|
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
2022-09-30 23:13:03 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
assert(column_family);
|
|
|
|
has_key_with_ts_ = true;
|
|
|
|
uint32_t cf_id = column_family->GetID();
|
|
|
|
std::array<Slice, 2> key_with_ts{{begin_key, ts}};
|
|
|
|
std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::DeleteRange(this, cf_id,
|
|
|
|
SliceParts(key_with_ts.data(), 2),
|
|
|
|
SliceParts(end_key_with_ts.data(), 2));
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts.size());
|
|
|
|
}
|
|
|
|
return s;
|
2016-08-16 15:16:04 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const SliceParts& begin_key,
|
|
|
|
const SliceParts& end_key) {
|
|
|
|
LocalSavePoint save(b);
|
2016-08-16 15:16:04 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
|
|
|
PutLengthPrefixedSliceParts(&b->rep_, begin_key);
|
|
|
|
PutLengthPrefixedSliceParts(&b->rep_, end_key);
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_DELETE_RANGE,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
// In `DeleteRange()`, the end key is treated as the value.
|
2021-09-12 22:33:15 +00:00
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
2021-09-14 20:13:36 +00:00
|
|
|
.ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
|
2021-09-12 22:33:15 +00:00
|
|
|
.ProtectC(column_family_id));
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2016-08-16 15:16:04 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|
|
|
const SliceParts& begin_key,
|
|
|
|
const SliceParts& end_key) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call this method on column family enabling timestamp");
|
2016-08-16 15:16:04 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const Slice& key, const Slice& value) {
|
2022-05-05 20:08:21 +00:00
|
|
|
if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
2018-02-09 22:50:09 +00:00
|
|
|
return Status::InvalidArgument("key is too large");
|
|
|
|
}
|
2022-05-05 20:08:21 +00:00
|
|
|
if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
|
2018-02-09 22:50:09 +00:00
|
|
|
return Status::InvalidArgument("value is too large");
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
LocalSavePoint save(b);
|
2014-04-22 18:27:33 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
2014-01-07 22:41:42 +00:00
|
|
|
if (column_family_id == 0) {
|
2014-04-22 18:27:33 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeMerge));
|
2014-01-07 22:41:42 +00:00
|
|
|
} else {
|
2014-04-22 18:27:33 +00:00
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
2014-01-07 22:41:42 +00:00
|
|
|
}
|
2014-04-22 18:27:33 +00:00
|
|
|
PutLengthPrefixedSlice(&b->rep_, key);
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, value);
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_MERGE,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
|
|
|
b->prot_info_->entries_.emplace_back(ProtectionInfo64()
|
|
|
|
.ProtectKVO(key, value, kTypeMerge)
|
|
|
|
.ProtectC(column_family_id));
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Merge(this, cf_id, key, value);
|
|
|
|
} else {
|
|
|
|
needs_in_place_update_ts_ = true;
|
|
|
|
has_key_with_ts_ = true;
|
|
|
|
std::string dummy_ts(ts_sz, '\0');
|
|
|
|
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
2022-11-01 05:28:58 +00:00
|
|
|
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Merge(
|
|
|
|
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
2022-11-01 05:28:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& ts, const Slice& value) {
|
2024-08-05 20:06:45 +00:00
|
|
|
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
2022-11-01 05:28:58 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
has_key_with_ts_ = true;
|
|
|
|
assert(column_family);
|
|
|
|
uint32_t cf_id = column_family->GetID();
|
|
|
|
std::array<Slice, 2> key_with_ts{{key, ts}};
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Merge(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
|
|
|
SliceParts(&value, 1));
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts.size());
|
|
|
|
}
|
|
|
|
return s;
|
2013-03-21 22:59:47 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
|
|
|
|
const SliceParts& key,
|
|
|
|
const SliceParts& value) {
|
2018-02-09 22:50:09 +00:00
|
|
|
Status s = CheckSlicePartsLength(key, value);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
LocalSavePoint save(b);
|
2015-05-27 23:59:22 +00:00
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeMerge));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
|
|
|
PutLengthPrefixedSliceParts(&b->rep_, key);
|
|
|
|
PutLengthPrefixedSliceParts(&b->rep_, value);
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_MERGE,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
|
|
|
b->prot_info_->entries_.emplace_back(ProtectionInfo64()
|
|
|
|
.ProtectKVO(key, value, kTypeMerge)
|
|
|
|
.ProtectC(column_family_id));
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2015-05-27 23:59:22 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
|
|
|
|
const SliceParts& key, const SliceParts& value) {
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
size_t ts_sz = 0;
|
|
|
|
uint32_t cf_id = 0;
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
std::tie(s, cf_id, ts_sz) =
|
|
|
|
WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
|
|
|
|
column_family);
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (0 == ts_sz) {
|
2024-08-05 20:06:45 +00:00
|
|
|
s = WriteBatchInternal::Merge(this, cf_id, key, value);
|
|
|
|
if (s.ok()) {
|
|
|
|
MaybeTrackTimestampSize(cf_id, ts_sz);
|
|
|
|
}
|
|
|
|
return s;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Cannot call this method on column family enabling timestamp");
|
2015-05-27 23:59:22 +00:00
|
|
|
}
|
|
|
|
|
2017-10-03 16:08:07 +00:00
|
|
|
Status WriteBatchInternal::PutBlobIndex(WriteBatch* b,
|
|
|
|
uint32_t column_family_id,
|
|
|
|
const Slice& key, const Slice& value) {
|
|
|
|
LocalSavePoint save(b);
|
|
|
|
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeBlobIndex));
|
|
|
|
} else {
|
|
|
|
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyBlobIndex));
|
|
|
|
PutVarint32(&b->rep_, column_family_id);
|
|
|
|
}
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, key);
|
|
|
|
PutLengthPrefixedSlice(&b->rep_, value);
|
|
|
|
b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
|
|
|
|
ContentFlags::HAS_BLOB_INDEX,
|
|
|
|
std::memory_order_relaxed);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (b->prot_info_ != nullptr) {
|
|
|
|
// See comment in first `WriteBatchInternal::Put()` overload concerning the
|
2021-09-14 20:13:36 +00:00
|
|
|
// `ValueType` argument passed to `ProtectKVO()`.
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
b->prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
2021-09-14 20:13:36 +00:00
|
|
|
.ProtectKVO(key, value, kTypeBlobIndex)
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
.ProtectC(column_family_id));
|
|
|
|
}
|
2017-10-03 16:08:07 +00:00
|
|
|
return save.commit();
|
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatch::PutLogData(const Slice& blob) {
|
|
|
|
LocalSavePoint save(this);
|
2013-08-14 23:32:46 +00:00
|
|
|
rep_.push_back(static_cast<char>(kTypeLogData));
|
|
|
|
PutLengthPrefixedSlice(&rep_, blob);
|
2017-04-10 22:38:34 +00:00
|
|
|
return save.commit();
|
2013-08-14 23:32:46 +00:00
|
|
|
}
|
2013-03-21 22:59:47 +00:00
|
|
|
|
2015-07-11 03:15:45 +00:00
|
|
|
void WriteBatch::SetSavePoint() {
|
|
|
|
if (save_points_ == nullptr) {
|
refactor SavePoints (#5192)
Summary:
Savepoints are assumed to be used in a stack-wise fashion (only
the top element should be used), so they were stored by `WriteBatch`
in a member variable `save_points` using an std::stack.
Conceptually this is fine, but the implementation had a few issues:
- the `save_points_` instance variable was a plain pointer to a heap-
allocated `SavePoints` struct. The destructor of `WriteBatch` simply
deletes this pointer. However, the copy constructor of WriteBatch
just copied that pointer, meaning that copying a WriteBatch with
active savepoints will very likely have crashed before. Now a proper
copy of the savepoints is made in the copy constructor, and not just
a copy of the pointer
- `save_points_` was an std::stack, which defaults to `std::deque` for
the underlying container. A deque is a bit over the top here, as we
only need access to the most recent savepoint (i.e. stack.top()) but
never any elements at the front. std::deque is rather expensive to
initialize in common environments. For example, the STL implementation
shipped with GNU g++ will perform a heap allocation of more than 500
bytes to create an empty deque object. Although the `save_points_`
container is created lazily by RocksDB, moving from a deque to a plain
`std::vector` is much more memory-efficient. So `save_points_` is now
a vector.
- `save_points_` was changed from a plain pointer to an `std::unique_ptr`,
making ownership more explicit.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5192
Differential Revision: D15024074
Pulled By: maysamyabandeh
fbshipit-source-id: 5b128786d3789cde94e46465c9e91badd07a25d7
2019-04-20 03:30:03 +00:00
|
|
|
save_points_.reset(new SavePoints());
|
2015-07-11 03:15:45 +00:00
|
|
|
}
|
|
|
|
// Record length and count of current batch of writes.
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
save_points_->stack.push(SavePoint(
|
|
|
|
GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)));
|
2015-07-11 03:15:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteBatch::RollbackToSavePoint() {
|
|
|
|
if (save_points_ == nullptr || save_points_->stack.size() == 0) {
|
|
|
|
return Status::NotFound();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pop the most recent savepoint off the stack
|
|
|
|
SavePoint savepoint = save_points_->stack.top();
|
|
|
|
save_points_->stack.pop();
|
|
|
|
|
|
|
|
assert(savepoint.size <= rep_.size());
|
2019-09-09 18:22:28 +00:00
|
|
|
assert(static_cast<uint32_t>(savepoint.count) <= Count());
|
2015-07-11 03:15:45 +00:00
|
|
|
|
|
|
|
if (savepoint.size == rep_.size()) {
|
|
|
|
// No changes to rollback
|
|
|
|
} else if (savepoint.size == 0) {
|
|
|
|
// Rollback everything
|
|
|
|
Clear();
|
|
|
|
} else {
|
|
|
|
rep_.resize(savepoint.size);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (prot_info_ != nullptr) {
|
|
|
|
prot_info_->entries_.resize(savepoint.count);
|
|
|
|
}
|
2015-07-11 03:15:45 +00:00
|
|
|
WriteBatchInternal::SetCount(this, savepoint.count);
|
2015-11-06 15:03:30 +00:00
|
|
|
content_flags_.store(savepoint.content_flags, std::memory_order_relaxed);
|
2015-07-11 03:15:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2017-05-03 17:54:07 +00:00
|
|
|
Status WriteBatch::PopSavePoint() {
|
|
|
|
if (save_points_ == nullptr || save_points_->stack.size() == 0) {
|
|
|
|
return Status::NotFound();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pop the most recent savepoint off the stack
|
|
|
|
save_points_->stack.pop();
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
Status WriteBatch::UpdateTimestamps(
|
|
|
|
const Slice& ts, std::function<size_t(uint32_t)> ts_sz_func) {
|
|
|
|
TimestampUpdater<decltype(ts_sz_func)> ts_updater(prot_info_.get(),
|
|
|
|
std::move(ts_sz_func), ts);
|
|
|
|
const Status s = Iterate(&ts_updater);
|
|
|
|
if (s.ok()) {
|
|
|
|
needs_in_place_update_ts_ = false;
|
|
|
|
}
|
|
|
|
return s;
|
2021-12-11 04:32:31 +00:00
|
|
|
}
|
|
|
|
|
2022-06-15 20:43:58 +00:00
|
|
|
Status WriteBatch::VerifyChecksum() const {
|
|
|
|
if (prot_info_ == nullptr) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
Slice input(rep_.data() + WriteBatchInternal::kHeader,
|
|
|
|
rep_.size() - WriteBatchInternal::kHeader);
|
|
|
|
Slice key, value, blob, xid;
|
|
|
|
char tag = 0;
|
|
|
|
uint32_t column_family = 0; // default
|
|
|
|
Status s;
|
|
|
|
size_t prot_info_idx = 0;
|
|
|
|
bool checksum_protected = true;
|
|
|
|
while (!input.empty() && prot_info_idx < prot_info_->entries_.size()) {
|
|
|
|
// In case key/value/column_family are not updated by
|
|
|
|
// ReadRecordFromWriteBatch
|
|
|
|
key.clear();
|
|
|
|
value.clear();
|
|
|
|
column_family = 0;
|
|
|
|
s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
|
2024-04-30 22:40:35 +00:00
|
|
|
&blob, &xid, /*write_unix_time=*/nullptr);
|
2022-06-15 20:43:58 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
checksum_protected = true;
|
|
|
|
// Write batch checksum uses op_type without ColumnFamily (e.g., if op_type
|
|
|
|
// in the write batch is kTypeColumnFamilyValue, kTypeValue is used to
|
|
|
|
// compute the checksum), and encodes column family id separately. See
|
|
|
|
// comment in first `WriteBatchInternal::Put()` for more detail.
|
|
|
|
switch (tag) {
|
|
|
|
case kTypeColumnFamilyValue:
|
|
|
|
case kTypeValue:
|
|
|
|
tag = kTypeValue;
|
|
|
|
break;
|
|
|
|
case kTypeColumnFamilyDeletion:
|
|
|
|
case kTypeDeletion:
|
|
|
|
tag = kTypeDeletion;
|
|
|
|
break;
|
|
|
|
case kTypeColumnFamilySingleDeletion:
|
|
|
|
case kTypeSingleDeletion:
|
|
|
|
tag = kTypeSingleDeletion;
|
|
|
|
break;
|
|
|
|
case kTypeColumnFamilyRangeDeletion:
|
|
|
|
case kTypeRangeDeletion:
|
|
|
|
tag = kTypeRangeDeletion;
|
|
|
|
break;
|
|
|
|
case kTypeColumnFamilyMerge:
|
|
|
|
case kTypeMerge:
|
|
|
|
tag = kTypeMerge;
|
|
|
|
break;
|
|
|
|
case kTypeColumnFamilyBlobIndex:
|
|
|
|
case kTypeBlobIndex:
|
|
|
|
tag = kTypeBlobIndex;
|
|
|
|
break;
|
|
|
|
case kTypeLogData:
|
|
|
|
case kTypeBeginPrepareXID:
|
|
|
|
case kTypeEndPrepareXID:
|
|
|
|
case kTypeCommitXID:
|
|
|
|
case kTypeRollbackXID:
|
|
|
|
case kTypeNoop:
|
|
|
|
case kTypeBeginPersistedPrepareXID:
|
|
|
|
case kTypeBeginUnprepareXID:
|
|
|
|
case kTypeDeletionWithTimestamp:
|
|
|
|
case kTypeCommitXIDAndTimestamp:
|
|
|
|
checksum_protected = false;
|
|
|
|
break;
|
2022-06-25 22:30:47 +00:00
|
|
|
case kTypeColumnFamilyWideColumnEntity:
|
|
|
|
case kTypeWideColumnEntity:
|
|
|
|
tag = kTypeWideColumnEntity;
|
|
|
|
break;
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
case kTypeColumnFamilyValuePreferredSeqno:
|
|
|
|
case kTypeValuePreferredSeqno:
|
|
|
|
tag = kTypeValuePreferredSeqno;
|
|
|
|
break;
|
2022-06-15 20:43:58 +00:00
|
|
|
default:
|
|
|
|
return Status::Corruption(
|
|
|
|
"unknown WriteBatch tag",
|
|
|
|
std::to_string(static_cast<unsigned int>(tag)));
|
|
|
|
}
|
|
|
|
if (checksum_protected) {
|
|
|
|
s = prot_info_->entries_[prot_info_idx++]
|
|
|
|
.StripC(column_family)
|
|
|
|
.StripKVO(key, value, static_cast<ValueType>(tag))
|
|
|
|
.GetStatus();
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (prot_info_idx != WriteBatchInternal::Count(this)) {
|
|
|
|
return Status::Corruption("WriteBatch has wrong count");
|
|
|
|
}
|
|
|
|
assert(WriteBatchInternal::Count(this) == prot_info_->entries_.size());
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2022-04-28 21:42:00 +00:00
|
|
|
namespace {
|
|
|
|
|
2011-05-21 02:17:43 +00:00
|
|
|
class MemTableInserter : public WriteBatch::Handler {
|
|
|
|
SequenceNumber sequence_;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
ColumnFamilyMemTables* const cf_mems_;
|
|
|
|
FlushScheduler* const flush_scheduler_;
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
TrimHistoryScheduler* const trim_history_scheduler_;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
const bool ignore_missing_column_families_;
|
2016-04-18 18:11:51 +00:00
|
|
|
const uint64_t recovering_log_number_;
|
|
|
|
// log number that all Memtables inserted into should reference
|
|
|
|
uint64_t log_number_ref_;
|
2013-07-12 23:56:52 +00:00
|
|
|
DBImpl* db_;
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
const bool concurrent_memtable_writes_;
|
2022-11-02 21:34:24 +00:00
|
|
|
bool post_info_created_;
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const WriteBatch::ProtectionInfo* prot_info_;
|
|
|
|
size_t prot_info_idx_;
|
2017-03-22 18:07:52 +00:00
|
|
|
|
Ignore stale logs while restarting DBs
Summary:
Stale log files can be deleted out of order. This can happen for various reasons. One of the reason is that no data is ever inserted to a column family and we have an optimization to update its log number, but not all the old log files are cleaned up (the case shown in the unit tests added). It can also happen when we simply delete multiple log files out of order.
This causes data corruption because we simply increase seqID after processing the next row and we may end up with writing data with smaller seqID than what is already flushed to memtables.
In DB recovery, for the oldest files we are replaying, if there it contains no data for any column family, we ignore the sequence IDs in the file.
Test Plan: Add two unit tests that fail without the fix.
Reviewers: IslamAbdelRahman, igor, yiwu
Reviewed By: yiwu
Subscribers: hermanlee4, yoshinorim, leveldb, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D60891
2016-07-19 18:48:00 +00:00
|
|
|
bool* has_valid_writes_;
|
2017-03-22 18:07:52 +00:00
|
|
|
// On some (!) platforms just default creating
|
|
|
|
// a map is too expensive in the Write() path as they
|
|
|
|
// cause memory allocations though unused.
|
|
|
|
// Make creation optional but do not incur
|
2018-11-09 19:17:34 +00:00
|
|
|
// std::unique_ptr additional allocation
|
2017-10-03 03:40:23 +00:00
|
|
|
using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>;
|
2024-05-28 22:33:57 +00:00
|
|
|
using PostMapType = aligned_storage<MemPostInfoMap>::type;
|
2017-03-22 18:07:52 +00:00
|
|
|
PostMapType mem_post_info_map_;
|
2016-04-18 18:11:51 +00:00
|
|
|
// current recovered transaction we are rebuilding (recovery)
|
|
|
|
WriteBatch* rebuilding_trx_;
|
2017-09-28 23:43:04 +00:00
|
|
|
SequenceNumber rebuilding_trx_seq_;
|
2017-09-18 21:36:53 +00:00
|
|
|
// Increase seq number once per each write batch. Otherwise increase it once
|
|
|
|
// per key.
|
|
|
|
bool seq_per_batch_;
|
2017-09-28 23:43:04 +00:00
|
|
|
// Whether the memtable write will be done only after the commit
|
|
|
|
bool write_after_commit_;
|
2018-06-29 01:46:39 +00:00
|
|
|
// Whether memtable write can be done before prepare
|
|
|
|
bool write_before_prepare_;
|
2018-07-07 00:17:36 +00:00
|
|
|
// Whether this batch was unprepared or not
|
|
|
|
bool unprepared_batch_;
|
2024-05-28 22:33:57 +00:00
|
|
|
using DupDetector = aligned_storage<DuplicateDetector>::type;
|
2022-11-02 21:34:24 +00:00
|
|
|
DupDetector duplicate_detector_;
|
|
|
|
bool dup_dectector_on_;
|
2011-05-21 02:17:43 +00:00
|
|
|
|
2019-09-12 23:53:31 +00:00
|
|
|
bool hint_per_batch_;
|
|
|
|
bool hint_created_;
|
|
|
|
// Hints for this batch
|
|
|
|
using HintMap = std::unordered_map<MemTable*, void*>;
|
2024-05-28 22:33:57 +00:00
|
|
|
using HintMapType = aligned_storage<HintMap>::type;
|
2019-09-12 23:53:31 +00:00
|
|
|
HintMapType hint_;
|
|
|
|
|
|
|
|
HintMap& GetHintMap() {
|
|
|
|
assert(hint_per_batch_);
|
|
|
|
if (!hint_created_) {
|
|
|
|
new (&hint_) HintMap();
|
|
|
|
hint_created_ = true;
|
|
|
|
}
|
|
|
|
return *reinterpret_cast<HintMap*>(&hint_);
|
|
|
|
}
|
|
|
|
|
2017-03-22 18:07:52 +00:00
|
|
|
MemPostInfoMap& GetPostMap() {
|
|
|
|
assert(concurrent_memtable_writes_);
|
2022-11-02 21:34:24 +00:00
|
|
|
if (!post_info_created_) {
|
2017-03-22 18:07:52 +00:00
|
|
|
new (&mem_post_info_map_) MemPostInfoMap();
|
|
|
|
post_info_created_ = true;
|
|
|
|
}
|
|
|
|
return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
|
|
|
|
}
|
|
|
|
|
2018-03-14 07:55:04 +00:00
|
|
|
bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) {
|
|
|
|
assert(!write_after_commit_);
|
|
|
|
assert(rebuilding_trx_ != nullptr);
|
|
|
|
if (!dup_dectector_on_) {
|
|
|
|
new (&duplicate_detector_) DuplicateDetector(db_);
|
|
|
|
dup_dectector_on_ = true;
|
|
|
|
}
|
2022-11-02 21:34:24 +00:00
|
|
|
return reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
|
|
|
|
->IsDuplicateKeySeq(column_family_id, key, sequence_);
|
2018-03-14 07:55:04 +00:00
|
|
|
}
|
|
|
|
|
2021-09-14 20:13:36 +00:00
|
|
|
const ProtectionInfoKVOC64* NextProtectionInfo() {
|
|
|
|
const ProtectionInfoKVOC64* res = nullptr;
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (prot_info_ != nullptr) {
|
|
|
|
assert(prot_info_idx_ < prot_info_->entries_.size());
|
|
|
|
res = &prot_info_->entries_[prot_info_idx_];
|
|
|
|
++prot_info_idx_;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2022-04-28 21:42:00 +00:00
|
|
|
void DecrementProtectionInfoIdxForTryAgain() {
|
2023-12-04 19:17:32 +00:00
|
|
|
if (prot_info_ != nullptr) {
|
|
|
|
--prot_info_idx_;
|
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void ResetProtectionInfo() {
|
|
|
|
prot_info_idx_ = 0;
|
|
|
|
prot_info_ = nullptr;
|
|
|
|
}
|
|
|
|
|
2017-11-11 19:23:43 +00:00
|
|
|
protected:
|
2022-04-28 21:42:00 +00:00
|
|
|
Handler::OptionState WriteBeforePrepare() const override {
|
|
|
|
return write_before_prepare_ ? Handler::OptionState::kEnabled
|
|
|
|
: Handler::OptionState::kDisabled;
|
|
|
|
}
|
|
|
|
Handler::OptionState WriteAfterCommit() const override {
|
|
|
|
return write_after_commit_ ? Handler::OptionState::kEnabled
|
|
|
|
: Handler::OptionState::kDisabled;
|
|
|
|
}
|
2017-11-11 19:23:43 +00:00
|
|
|
|
2017-09-18 21:36:53 +00:00
|
|
|
public:
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
// cf_mems should not be shared with concurrent inserters
|
2017-09-18 21:36:53 +00:00
|
|
|
MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
|
|
|
|
FlushScheduler* flush_scheduler,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
TrimHistoryScheduler* trim_history_scheduler,
|
2017-09-18 21:36:53 +00:00
|
|
|
bool ignore_missing_column_families,
|
|
|
|
uint64_t recovering_log_number, DB* db,
|
|
|
|
bool concurrent_memtable_writes,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const WriteBatch::ProtectionInfo* prot_info,
|
2018-06-29 01:46:39 +00:00
|
|
|
bool* has_valid_writes = nullptr, bool seq_per_batch = false,
|
2019-09-12 23:53:31 +00:00
|
|
|
bool batch_per_txn = true, bool hint_per_batch = false)
|
2017-09-18 21:36:53 +00:00
|
|
|
: sequence_(_sequence),
|
|
|
|
cf_mems_(cf_mems),
|
|
|
|
flush_scheduler_(flush_scheduler),
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
trim_history_scheduler_(trim_history_scheduler),
|
2017-09-18 21:36:53 +00:00
|
|
|
ignore_missing_column_families_(ignore_missing_column_families),
|
|
|
|
recovering_log_number_(recovering_log_number),
|
|
|
|
log_number_ref_(0),
|
2020-04-29 20:06:27 +00:00
|
|
|
db_(static_cast_with_check<DBImpl>(db)),
|
2017-09-18 21:36:53 +00:00
|
|
|
concurrent_memtable_writes_(concurrent_memtable_writes),
|
|
|
|
post_info_created_(false),
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
prot_info_(prot_info),
|
|
|
|
prot_info_idx_(0),
|
2017-09-18 21:36:53 +00:00
|
|
|
has_valid_writes_(has_valid_writes),
|
|
|
|
rebuilding_trx_(nullptr),
|
2017-12-07 19:52:12 +00:00
|
|
|
rebuilding_trx_seq_(0),
|
2017-09-28 23:43:04 +00:00
|
|
|
seq_per_batch_(seq_per_batch),
|
|
|
|
// Write after commit currently uses one seq per key (instead of per
|
|
|
|
// batch). So seq_per_batch being false indicates write_after_commit
|
|
|
|
// approach.
|
2018-03-05 18:48:29 +00:00
|
|
|
write_after_commit_(!seq_per_batch),
|
2018-06-29 01:46:39 +00:00
|
|
|
// WriteUnprepared can write WriteBatches per transaction, so
|
|
|
|
// batch_per_txn being false indicates write_before_prepare.
|
|
|
|
write_before_prepare_(!batch_per_txn),
|
2018-07-07 00:17:36 +00:00
|
|
|
unprepared_batch_(false),
|
2018-03-14 07:55:04 +00:00
|
|
|
duplicate_detector_(),
|
2019-09-12 23:53:31 +00:00
|
|
|
dup_dectector_on_(false),
|
|
|
|
hint_per_batch_(hint_per_batch),
|
|
|
|
hint_created_(false) {
|
2017-09-18 21:36:53 +00:00
|
|
|
assert(cf_mems_);
|
2014-01-28 19:05:04 +00:00
|
|
|
}
|
|
|
|
|
2019-02-19 21:36:04 +00:00
|
|
|
~MemTableInserter() override {
|
2018-03-14 07:55:04 +00:00
|
|
|
if (dup_dectector_on_) {
|
2022-11-02 21:34:24 +00:00
|
|
|
reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
|
|
|
|
->~DuplicateDetector();
|
2018-03-14 07:55:04 +00:00
|
|
|
}
|
2017-03-22 18:07:52 +00:00
|
|
|
if (post_info_created_) {
|
2022-11-02 21:34:24 +00:00
|
|
|
reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_)->~MemPostInfoMap();
|
2017-03-22 18:07:52 +00:00
|
|
|
}
|
2019-09-12 23:53:31 +00:00
|
|
|
if (hint_created_) {
|
|
|
|
for (auto iter : GetHintMap()) {
|
|
|
|
delete[] reinterpret_cast<char*>(iter.second);
|
|
|
|
}
|
|
|
|
reinterpret_cast<HintMap*>(&hint_)->~HintMap();
|
|
|
|
}
|
2017-11-14 16:42:14 +00:00
|
|
|
delete rebuilding_trx_;
|
2017-03-22 18:07:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
MemTableInserter(const MemTableInserter&) = delete;
|
|
|
|
MemTableInserter& operator=(const MemTableInserter&) = delete;
|
|
|
|
|
2017-11-15 16:19:57 +00:00
|
|
|
// The batch seq is regularly restarted; In normal mode it is set when
|
|
|
|
// MemTableInserter is constructed in the write thread and in recovery mode it
|
|
|
|
// is set when a batch, which is tagged with seq, is read from the WAL.
|
|
|
|
// Within a sequenced batch, which could be a merge of multiple batches, we
|
|
|
|
// have two policies to advance the seq: i) seq_per_key (default) and ii)
|
2018-03-08 18:18:34 +00:00
|
|
|
// seq_per_batch. To implement the latter we need to mark the boundary between
|
2017-11-15 16:19:57 +00:00
|
|
|
// the individual batches. The approach is this: 1) Use the terminating
|
2018-03-08 18:18:34 +00:00
|
|
|
// markers to indicate the boundary (kTypeEndPrepareXID, kTypeCommitXID,
|
|
|
|
// kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absence of a
|
|
|
|
// natural boundary marker.
|
2017-09-18 21:36:53 +00:00
|
|
|
void MaybeAdvanceSeq(bool batch_boundry = false) {
|
|
|
|
if (batch_boundry == seq_per_batch_) {
|
|
|
|
sequence_++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-18 18:11:51 +00:00
|
|
|
void set_log_number_ref(uint64_t log) { log_number_ref_ = log; }
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) {
|
|
|
|
prot_info_ = prot_info;
|
|
|
|
prot_info_idx_ = 0;
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2017-05-31 17:45:47 +00:00
|
|
|
SequenceNumber sequence() const { return sequence_; }
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
2016-05-04 21:02:27 +00:00
|
|
|
|
2016-07-07 21:45:29 +00:00
|
|
|
void PostProcess() {
|
2017-03-22 18:07:52 +00:00
|
|
|
assert(concurrent_memtable_writes_);
|
|
|
|
// If post info was not created there is nothing
|
|
|
|
// to process and no need to create on demand
|
2022-11-02 21:34:24 +00:00
|
|
|
if (post_info_created_) {
|
2017-03-22 18:07:52 +00:00
|
|
|
for (auto& pair : GetPostMap()) {
|
|
|
|
pair.first->BatchPostProcess(pair.second);
|
|
|
|
}
|
2016-07-07 21:45:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-03-03 22:30:36 +00:00
|
|
|
bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
// If we are in a concurrent mode, it is the caller's responsibility
|
|
|
|
// to clone the original ColumnFamilyMemTables so that each thread
|
|
|
|
// has its own instance. Otherwise, it must be guaranteed that there
|
|
|
|
// is no concurrent access
|
2014-02-06 00:02:48 +00:00
|
|
|
bool found = cf_mems_->Seek(column_family_id);
|
2014-09-02 20:29:05 +00:00
|
|
|
if (!found) {
|
|
|
|
if (ignore_missing_column_families_) {
|
|
|
|
*s = Status::OK();
|
|
|
|
} else {
|
|
|
|
*s = Status::InvalidArgument(
|
|
|
|
"Invalid column family specified in write batch");
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
if (recovering_log_number_ != 0 &&
|
|
|
|
recovering_log_number_ < cf_mems_->GetLogNumber()) {
|
|
|
|
// This is true only in recovery environment (recovering_log_number_ is
|
|
|
|
// always 0 in
|
2014-09-02 20:29:05 +00:00
|
|
|
// non-recovery, regular write code-path)
|
2016-04-18 18:11:51 +00:00
|
|
|
// * If recovering_log_number_ < cf_mems_->GetLogNumber(), this means that
|
2022-09-30 23:13:03 +00:00
|
|
|
// column family already contains updates from this log. We can't apply
|
|
|
|
// updates twice because of update-in-place or merge workloads -- ignore
|
|
|
|
// the update
|
2014-03-03 22:30:36 +00:00
|
|
|
*s = Status::OK();
|
|
|
|
return false;
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
|
Ignore stale logs while restarting DBs
Summary:
Stale log files can be deleted out of order. This can happen for various reasons. One of the reason is that no data is ever inserted to a column family and we have an optimization to update its log number, but not all the old log files are cleaned up (the case shown in the unit tests added). It can also happen when we simply delete multiple log files out of order.
This causes data corruption because we simply increase seqID after processing the next row and we may end up with writing data with smaller seqID than what is already flushed to memtables.
In DB recovery, for the oldest files we are replaying, if there it contains no data for any column family, we ignore the sequence IDs in the file.
Test Plan: Add two unit tests that fail without the fix.
Reviewers: IslamAbdelRahman, igor, yiwu
Reviewed By: yiwu
Subscribers: hermanlee4, yoshinorim, leveldb, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D60891
2016-07-19 18:48:00 +00:00
|
|
|
if (has_valid_writes_ != nullptr) {
|
|
|
|
*has_valid_writes_ = true;
|
|
|
|
}
|
|
|
|
|
2016-04-18 18:11:51 +00:00
|
|
|
if (log_number_ref_ > 0) {
|
|
|
|
cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_);
|
|
|
|
}
|
|
|
|
|
2014-03-03 22:30:36 +00:00
|
|
|
return true;
|
|
|
|
}
|
2015-11-06 15:29:10 +00:00
|
|
|
|
2024-05-22 00:22:20 +00:00
|
|
|
template <typename RebuildTxnOp>
|
2017-10-03 16:08:07 +00:00
|
|
|
Status PutCFImpl(uint32_t column_family_id, const Slice& key,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const Slice& value, ValueType value_type,
|
2024-05-22 18:06:52 +00:00
|
|
|
RebuildTxnOp rebuild_txn_op,
|
2021-09-14 20:13:36 +00:00
|
|
|
const ProtectionInfoKVOS64* kv_prot_info) {
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
|
|
|
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2024-05-22 18:06:52 +00:00
|
|
|
return rebuild_txn_op(rebuilding_trx_, column_family_id, key, value);
|
2017-09-28 23:43:04 +00:00
|
|
|
// else insert the values to the memtable right away
|
2016-05-05 19:48:52 +00:00
|
|
|
}
|
|
|
|
|
2020-08-24 23:41:42 +00:00
|
|
|
Status ret_status;
|
|
|
|
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok() && rebuilding_trx_ != nullptr) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2018-03-08 18:18:34 +00:00
|
|
|
// The CF is probably flushed and hence no need for insert but we still
|
2018-03-05 18:48:29 +00:00
|
|
|
// need to keep track of the keys for upcoming rollback/commit.
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2024-05-22 00:22:20 +00:00
|
|
|
ret_status =
|
2024-05-22 18:06:52 +00:00
|
|
|
rebuild_txn_op(rebuilding_trx_, column_family_id, key, value);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
|
|
|
|
}
|
|
|
|
} else if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(false /* batch_boundary */);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2020-08-24 23:41:42 +00:00
|
|
|
return ret_status;
|
2014-01-28 19:05:04 +00:00
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
assert(ret_status.ok());
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2014-02-06 00:02:48 +00:00
|
|
|
MemTable* mem = cf_mems_->GetMemTable();
|
2017-11-03 05:16:23 +00:00
|
|
|
auto* moptions = mem->GetImmutableMemTableOptions();
|
2018-02-06 02:32:54 +00:00
|
|
|
// inplace_update_support is inconsistent with snapshots, and therefore with
|
|
|
|
// any kind of transactions including the ones that use seq_per_batch
|
|
|
|
assert(!seq_per_batch_ || !moptions->inplace_update_support);
|
2014-09-09 01:46:52 +00:00
|
|
|
if (!moptions->inplace_update_support) {
|
2020-11-24 00:27:46 +00:00
|
|
|
ret_status =
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
mem->Add(sequence_, value_type, key, value, kv_prot_info,
|
2019-09-12 23:53:31 +00:00
|
|
|
concurrent_memtable_writes_, get_post_process_info(mem),
|
|
|
|
hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
|
2022-06-27 23:37:09 +00:00
|
|
|
} else if (moptions->inplace_callback == nullptr ||
|
|
|
|
value_type != kTypeValue) {
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
assert(!concurrent_memtable_writes_);
|
2022-06-27 23:37:09 +00:00
|
|
|
ret_status = mem->Update(sequence_, value_type, key, value, kv_prot_info);
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 21:12:47 +00:00
|
|
|
} else {
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
assert(!concurrent_memtable_writes_);
|
2022-06-27 23:37:09 +00:00
|
|
|
assert(value_type == kTypeValue);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.IsNotFound()) {
|
Allow callback to change size of existing value. Change return type of the callback function to an enum status to handle 3 cases.
Summary:
This diff fixes 2 hacks:
* The callback function can modify the existing value inplace, if the merged value fits within the existing buffer size. But currently the existing buffer size is not being modified. Now the callback recieves a int* allowing the size to be modified. Since size is encoded as a varint in the internal key for memtable. It might happen that the entire value might have be copied to the new location if the new size varint is smaller than the existing size varint.
* The callback function has 3 functionalities
1. Modify existing buffer inplace, and update size correspondingly. Now to indicate that, Returns 1.
2. Generate a new buffer indicating merged value. Returns 2.
3. Fails to do either of above, based on whatever application logic. Returns 0.
Test Plan: Just make all for now. I'm adding another unit test to test each scenario.
Reviewers: dhruba, haobo
Reviewed By: haobo
CC: leveldb, sdong, kailiu, xinyaohu, sumeet, danguo
Differential Revision: https://reviews.facebook.net/D15195
2014-01-16 23:11:19 +00:00
|
|
|
// key not found in memtable. Do sst get, update, add
|
2014-01-14 15:55:16 +00:00
|
|
|
SnapshotImpl read_from_snapshot;
|
|
|
|
read_from_snapshot.number_ = sequence_;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
2014-01-14 15:55:16 +00:00
|
|
|
ReadOptions ropts;
|
2017-10-03 03:40:23 +00:00
|
|
|
// it's going to be overwritten for sure, so no point caching data block
|
|
|
|
// containing the old version
|
|
|
|
ropts.fill_cache = false;
|
2014-01-14 15:55:16 +00:00
|
|
|
ropts.snapshot = &read_from_snapshot;
|
|
|
|
|
|
|
|
std::string prev_value;
|
|
|
|
std::string merged_value;
|
2014-02-11 01:04:44 +00:00
|
|
|
|
|
|
|
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
|
2020-11-24 00:27:46 +00:00
|
|
|
Status get_status = Status::NotSupported();
|
2016-11-10 19:10:23 +00:00
|
|
|
if (db_ != nullptr && recovering_log_number_ == 0) {
|
2016-09-21 18:05:07 +00:00
|
|
|
if (cf_handle == nullptr) {
|
|
|
|
cf_handle = db_->DefaultColumnFamily();
|
|
|
|
}
|
2022-11-01 05:28:58 +00:00
|
|
|
// TODO (yanqin): fix when user-defined timestamp is enabled.
|
2020-11-24 00:27:46 +00:00
|
|
|
get_status = db_->Get(ropts, cf_handle, key, &prev_value);
|
|
|
|
}
|
|
|
|
// Intentionally overwrites the `NotFound` in `ret_status`.
|
|
|
|
if (!get_status.ok() && !get_status.IsNotFound()) {
|
|
|
|
ret_status = get_status;
|
|
|
|
} else {
|
|
|
|
ret_status = Status::OK();
|
2014-02-11 01:04:44 +00:00
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok()) {
|
|
|
|
UpdateStatus update_status;
|
|
|
|
char* prev_buffer = const_cast<char*>(prev_value.c_str());
|
|
|
|
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
|
|
|
|
if (get_status.ok()) {
|
|
|
|
update_status = moptions->inplace_callback(prev_buffer, &prev_size,
|
|
|
|
value, &merged_value);
|
|
|
|
} else {
|
|
|
|
update_status = moptions->inplace_callback(
|
|
|
|
nullptr /* existing_value */, nullptr /* existing_value_size */,
|
|
|
|
value, &merged_value);
|
|
|
|
}
|
|
|
|
if (update_status == UpdateStatus::UPDATED_INPLACE) {
|
|
|
|
assert(get_status.ok());
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
2021-09-14 20:13:36 +00:00
|
|
|
ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
updated_kv_prot_info.UpdateV(value,
|
|
|
|
Slice(prev_buffer, prev_size));
|
|
|
|
// prev_value is updated in-place with final value.
|
|
|
|
ret_status = mem->Add(sequence_, value_type, key,
|
|
|
|
Slice(prev_buffer, prev_size),
|
|
|
|
&updated_kv_prot_info);
|
|
|
|
} else {
|
|
|
|
ret_status = mem->Add(sequence_, value_type, key,
|
|
|
|
Slice(prev_buffer, prev_size),
|
|
|
|
nullptr /* kv_prot_info */);
|
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok()) {
|
|
|
|
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
|
|
|
|
}
|
|
|
|
} else if (update_status == UpdateStatus::UPDATED) {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
2021-09-14 20:13:36 +00:00
|
|
|
ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
updated_kv_prot_info.UpdateV(value, merged_value);
|
|
|
|
// merged_value contains the final value.
|
|
|
|
ret_status = mem->Add(sequence_, value_type, key,
|
|
|
|
Slice(merged_value), &updated_kv_prot_info);
|
|
|
|
} else {
|
|
|
|
// merged_value contains the final value.
|
|
|
|
ret_status =
|
|
|
|
mem->Add(sequence_, value_type, key, Slice(merged_value),
|
|
|
|
nullptr /* kv_prot_info */);
|
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok()) {
|
|
|
|
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
|
|
|
|
}
|
|
|
|
}
|
2014-01-14 15:55:16 +00:00
|
|
|
}
|
|
|
|
}
|
In-place updates for equal keys and similar sized values
Summary:
Currently for each put, a fresh memory is allocated, and a new entry is added to the memtable with a new sequence number irrespective of whether the key already exists in the memtable. This diff is an attempt to update the value inplace for existing keys. It currently handles a very simple case:
1. Key already exists in the current memtable. Does not inplace update values in immutable memtable or snapshot
2. Latest value type is a 'put' ie kTypeValue
3. New value size is less than existing value, to avoid reallocating memory
TODO: For a put of an existing key, deallocate memory take by values, for other value types till a kTypeValue is found, ie. remove kTypeMerge.
TODO: Update the transaction log, to allow consistent reload of the memtable.
Test Plan: Added a unit test verifying the inplace update. But some other unit tests broken due to invalid sequence number checks. WIll fix them next.
Reviewers: xinyaohu, sumeet, haobo, dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D12423
Automatic commit by arc
2013-08-19 21:12:47 +00:00
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
assert(seq_per_batch_);
|
|
|
|
const bool kBatchBoundary = true;
|
|
|
|
MaybeAdvanceSeq(kBatchBoundary);
|
|
|
|
} else if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq();
|
|
|
|
CheckMemtableFull();
|
|
|
|
}
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
2020-11-24 00:27:46 +00:00
|
|
|
// If `ret_status` is `TryAgain` then the next (successful) try will add
|
|
|
|
// the key to the rebuilding transaction object. If `ret_status` is
|
|
|
|
// another non-OK `Status`, then the `rebuilding_trx_` will be thrown
|
|
|
|
// away. So we only need to add to it when `ret_status.ok()`.
|
|
|
|
if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2024-05-22 18:06:52 +00:00
|
|
|
ret_status =
|
|
|
|
rebuild_txn_op(rebuilding_trx_, column_family_id, key, value);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2018-02-06 02:32:54 +00:00
|
|
|
return ret_status;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
2014-01-14 15:55:16 +00:00
|
|
|
|
2019-02-19 21:36:04 +00:00
|
|
|
Status PutCF(uint32_t column_family_id, const Slice& key,
|
|
|
|
const Slice& value) override {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const auto* kv_prot_info = NextProtectionInfo();
|
2022-04-28 21:42:00 +00:00
|
|
|
Status ret_status;
|
2024-05-22 00:22:20 +00:00
|
|
|
|
2024-05-22 18:06:52 +00:00
|
|
|
auto rebuild_txn_op = [](WriteBatch* rebuilding_trx, uint32_t cf_id,
|
|
|
|
const Slice& k, const Slice& v) -> Status {
|
2024-05-22 00:22:20 +00:00
|
|
|
return WriteBatchInternal::Put(rebuilding_trx, cf_id, k, v);
|
|
|
|
};
|
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
// Memtable needs seqno, doesn't need CF ID
|
|
|
|
auto mem_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
2022-04-28 21:42:00 +00:00
|
|
|
ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
|
2024-05-22 18:06:52 +00:00
|
|
|
rebuild_txn_op, &mem_kv_prot_info);
|
2022-04-28 21:42:00 +00:00
|
|
|
} else {
|
|
|
|
ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
|
2024-05-22 18:06:52 +00:00
|
|
|
rebuild_txn_op, nullptr /* kv_prot_info */);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
// TODO: this assumes that if TryAgain status is returned to the caller,
|
|
|
|
// the operation is actually tried again. The proper way to do this is to
|
|
|
|
// pass a `try_again` parameter to the operation itself and decrement
|
|
|
|
// prot_info_idx_ based on that
|
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
|
|
|
return ret_status;
|
2017-10-03 16:08:07 +00:00
|
|
|
}
|
|
|
|
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
Status TimedPutCF(uint32_t column_family_id, const Slice& key,
|
|
|
|
const Slice& value, uint64_t unix_write_time) override {
|
|
|
|
const auto* kv_prot_info = NextProtectionInfo();
|
|
|
|
Status ret_status;
|
|
|
|
std::string value_buf;
|
|
|
|
Slice packed_value =
|
|
|
|
PackValueAndWriteTime(value, unix_write_time, &value_buf);
|
2024-05-22 00:22:20 +00:00
|
|
|
|
2024-05-22 18:06:52 +00:00
|
|
|
auto rebuild_txn_op = [](WriteBatch* /* rebuilding_trx */,
|
|
|
|
uint32_t /* cf_id */, const Slice& /* k */,
|
|
|
|
const Slice& /* v */) -> Status {
|
2024-05-22 00:22:20 +00:00
|
|
|
return Status::NotSupported();
|
|
|
|
};
|
|
|
|
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
auto mem_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
2024-05-22 18:06:52 +00:00
|
|
|
ret_status = PutCFImpl(column_family_id, key, packed_value,
|
|
|
|
kTypeValuePreferredSeqno, rebuild_txn_op,
|
|
|
|
&mem_kv_prot_info);
|
2024-05-22 00:22:20 +00:00
|
|
|
} else {
|
|
|
|
ret_status = PutCFImpl(column_family_id, key, packed_value,
|
2024-05-22 18:06:52 +00:00
|
|
|
kTypeValuePreferredSeqno, rebuild_txn_op,
|
2024-05-22 00:22:20 +00:00
|
|
|
nullptr /* kv_prot_info */);
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: this assumes that if TryAgain status is returned to the caller,
|
|
|
|
// The operation is actually tried again. The proper way to do this is to
|
|
|
|
// pass a `try_again` parameter to the operation itself and decrement
|
|
|
|
// prot_info_idx_ based on that.
|
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
|
|
|
return ret_status;
|
|
|
|
}
|
|
|
|
|
2022-06-25 22:30:47 +00:00
|
|
|
Status PutEntityCF(uint32_t column_family_id, const Slice& key,
|
|
|
|
const Slice& value) override {
|
|
|
|
const auto* kv_prot_info = NextProtectionInfo();
|
|
|
|
|
|
|
|
Status s;
|
2024-05-22 00:22:20 +00:00
|
|
|
|
2024-05-22 18:06:52 +00:00
|
|
|
auto rebuild_txn_op = [](WriteBatch* rebuilding_trx, uint32_t cf_id,
|
|
|
|
const Slice& k, Slice entity) -> Status {
|
2024-05-22 00:22:20 +00:00
|
|
|
WideColumns columns;
|
|
|
|
const Status st = WideColumnSerialization::Deserialize(entity, columns);
|
|
|
|
if (!st.ok()) {
|
|
|
|
return st;
|
|
|
|
}
|
|
|
|
|
|
|
|
return WriteBatchInternal::PutEntity(rebuilding_trx, cf_id, k, columns);
|
|
|
|
};
|
|
|
|
|
2022-06-25 22:30:47 +00:00
|
|
|
if (kv_prot_info) {
|
|
|
|
// Memtable needs seqno, doesn't need CF ID
|
|
|
|
auto mem_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
|
|
|
s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
|
2024-05-22 18:06:52 +00:00
|
|
|
rebuild_txn_op, &mem_kv_prot_info);
|
2022-06-25 22:30:47 +00:00
|
|
|
} else {
|
|
|
|
s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
|
2024-05-22 18:06:52 +00:00
|
|
|
rebuild_txn_op,
|
2022-06-25 22:30:47 +00:00
|
|
|
/* kv_prot_info */ nullptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(s.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const Slice& value, ValueType delete_type,
|
2021-09-14 20:13:36 +00:00
|
|
|
const ProtectionInfoKVOS64* kv_prot_info) {
|
2018-02-06 02:32:54 +00:00
|
|
|
Status ret_status;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
MemTable* mem = cf_mems_->GetMemTable();
|
2020-11-24 00:27:46 +00:00
|
|
|
ret_status =
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
mem->Add(sequence_, delete_type, key, value, kv_prot_info,
|
2019-09-12 23:53:31 +00:00
|
|
|
concurrent_memtable_writes_, get_post_process_info(mem),
|
|
|
|
hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
2018-02-06 02:32:54 +00:00
|
|
|
assert(seq_per_batch_);
|
2020-11-24 00:27:46 +00:00
|
|
|
const bool kBatchBoundary = true;
|
|
|
|
MaybeAdvanceSeq(kBatchBoundary);
|
|
|
|
} else if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq();
|
|
|
|
CheckMemtableFull();
|
2018-02-06 02:32:54 +00:00
|
|
|
}
|
|
|
|
return ret_status;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
}
|
|
|
|
|
2019-02-19 21:36:04 +00:00
|
|
|
Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const auto* kv_prot_info = NextProtectionInfo();
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
|
|
|
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
|
2017-09-28 23:43:04 +00:00
|
|
|
// else insert the values to the memtable right away
|
2016-05-05 19:48:52 +00:00
|
|
|
}
|
|
|
|
|
2020-08-24 23:41:42 +00:00
|
|
|
Status ret_status;
|
|
|
|
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok() && rebuilding_trx_ != nullptr) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2018-03-08 18:18:34 +00:00
|
|
|
// The CF is probably flushed and hence no need for insert but we still
|
2018-03-05 18:48:29 +00:00
|
|
|
// need to keep track of the keys for upcoming rollback/commit.
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
ret_status =
|
|
|
|
WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
|
|
|
|
}
|
|
|
|
} else if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(false /* batch_boundary */);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
2020-08-24 23:41:42 +00:00
|
|
|
return ret_status;
|
2016-04-18 18:11:51 +00:00
|
|
|
}
|
|
|
|
|
2020-05-28 17:37:57 +00:00
|
|
|
ColumnFamilyData* cfd = cf_mems_->current();
|
|
|
|
assert(!cfd || cfd->user_comparator());
|
|
|
|
const size_t ts_sz = (cfd && cfd->user_comparator())
|
|
|
|
? cfd->user_comparator()->timestamp_size()
|
|
|
|
: 0;
|
|
|
|
const ValueType delete_type =
|
|
|
|
(0 == ts_sz) ? kTypeDeletion : kTypeDeletionWithTimestamp;
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
auto mem_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
|
|
|
mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type);
|
|
|
|
ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
|
|
|
|
&mem_kv_prot_info);
|
|
|
|
} else {
|
|
|
|
ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
|
|
|
|
nullptr /* kv_prot_info */);
|
|
|
|
}
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
2020-11-24 00:27:46 +00:00
|
|
|
// If `ret_status` is `TryAgain` then the next (successful) try will add
|
|
|
|
// the key to the rebuilding transaction object. If `ret_status` is
|
|
|
|
// another non-OK `Status`, then the `rebuilding_trx_` will be thrown
|
|
|
|
// away. So we only need to add to it when `ret_status.ok()`.
|
|
|
|
if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
ret_status =
|
|
|
|
WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
2018-03-05 18:48:29 +00:00
|
|
|
return ret_status;
|
2015-11-06 15:29:10 +00:00
|
|
|
}
|
|
|
|
|
2019-02-19 21:36:04 +00:00
|
|
|
Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const auto* kv_prot_info = NextProtectionInfo();
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
|
|
|
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id,
|
|
|
|
key);
|
2017-09-28 23:43:04 +00:00
|
|
|
// else insert the values to the memtable right away
|
2016-05-05 19:48:52 +00:00
|
|
|
}
|
|
|
|
|
2020-08-24 23:41:42 +00:00
|
|
|
Status ret_status;
|
|
|
|
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok() && rebuilding_trx_ != nullptr) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2018-03-08 18:18:34 +00:00
|
|
|
// The CF is probably flushed and hence no need for insert but we still
|
2018-03-05 18:48:29 +00:00
|
|
|
// need to keep track of the keys for upcoming rollback/commit.
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
|
|
|
|
column_family_id, key);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
|
|
|
|
}
|
|
|
|
} else if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(false /* batch_boundary */);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
2020-08-24 23:41:42 +00:00
|
|
|
return ret_status;
|
2016-04-18 18:11:51 +00:00
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
assert(ret_status.ok());
|
2016-04-18 18:11:51 +00:00
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
auto mem_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
|
|
|
ret_status = DeleteImpl(column_family_id, key, Slice(),
|
|
|
|
kTypeSingleDeletion, &mem_kv_prot_info);
|
|
|
|
} else {
|
|
|
|
ret_status = DeleteImpl(column_family_id, key, Slice(),
|
|
|
|
kTypeSingleDeletion, nullptr /* kv_prot_info */);
|
|
|
|
}
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
2020-11-24 00:27:46 +00:00
|
|
|
// If `ret_status` is `TryAgain` then the next (successful) try will add
|
|
|
|
// the key to the rebuilding transaction object. If `ret_status` is
|
|
|
|
// another non-OK `Status`, then the `rebuilding_trx_` will be thrown
|
|
|
|
// away. So we only need to add to it when `ret_status.ok()`.
|
|
|
|
if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
|
|
|
|
column_family_id, key);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
2018-03-05 18:48:29 +00:00
|
|
|
return ret_status;
|
2016-08-16 15:16:04 +00:00
|
|
|
}
|
|
|
|
|
2019-02-19 21:36:04 +00:00
|
|
|
Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
|
|
|
|
const Slice& end_key) override {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const auto* kv_prot_info = NextProtectionInfo();
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
|
|
|
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
|
|
|
|
begin_key, end_key);
|
2017-09-28 23:43:04 +00:00
|
|
|
// else insert the values to the memtable right away
|
2016-08-16 15:16:04 +00:00
|
|
|
}
|
|
|
|
|
2020-08-24 23:41:42 +00:00
|
|
|
Status ret_status;
|
|
|
|
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok() && rebuilding_trx_ != nullptr) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2018-03-08 18:18:34 +00:00
|
|
|
// The CF is probably flushed and hence no need for insert but we still
|
2018-03-05 18:48:29 +00:00
|
|
|
// need to keep track of the keys for upcoming rollback/commit.
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
ret_status = WriteBatchInternal::DeleteRange(
|
|
|
|
rebuilding_trx_, column_family_id, begin_key, end_key);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key));
|
|
|
|
}
|
|
|
|
} else if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(false /* batch_boundary */);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
2020-08-24 23:41:42 +00:00
|
|
|
return ret_status;
|
2016-08-16 15:16:04 +00:00
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
assert(ret_status.ok());
|
|
|
|
|
2016-11-15 23:18:56 +00:00
|
|
|
if (db_ != nullptr) {
|
|
|
|
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
|
|
|
|
if (cf_handle == nullptr) {
|
|
|
|
cf_handle = db_->DefaultColumnFamily();
|
|
|
|
}
|
2020-07-03 02:24:25 +00:00
|
|
|
auto* cfd =
|
|
|
|
static_cast_with_check<ColumnFamilyHandleImpl>(cf_handle)->cfd();
|
2016-11-15 23:18:56 +00:00
|
|
|
if (!cfd->is_delete_range_supported()) {
|
2020-11-24 00:27:46 +00:00
|
|
|
// TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
|
|
|
|
ret_status.PermitUncheckedError();
|
2016-11-15 23:18:56 +00:00
|
|
|
return Status::NotSupported(
|
2024-10-17 21:13:20 +00:00
|
|
|
std::string("CF " + cfd->GetName() +
|
|
|
|
" reports it does not support DeleteRange"));
|
2016-11-15 23:18:56 +00:00
|
|
|
}
|
2022-09-30 23:13:03 +00:00
|
|
|
int cmp =
|
|
|
|
cfd->user_comparator()->CompareWithoutTimestamp(begin_key, end_key);
|
2020-05-07 18:53:32 +00:00
|
|
|
if (cmp > 0) {
|
2020-11-24 00:27:46 +00:00
|
|
|
// TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
|
|
|
|
ret_status.PermitUncheckedError();
|
2020-05-07 18:53:32 +00:00
|
|
|
// It's an empty range where endpoints appear mistaken. Don't bother
|
|
|
|
// applying it to the DB, and return an error to the user.
|
|
|
|
return Status::InvalidArgument("end key comes before start key");
|
|
|
|
} else if (cmp == 0) {
|
2020-11-24 00:27:46 +00:00
|
|
|
// TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
|
|
|
|
ret_status.PermitUncheckedError();
|
2020-05-07 18:53:32 +00:00
|
|
|
// It's an empty range. Don't bother applying it to the DB.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2016-11-15 23:18:56 +00:00
|
|
|
}
|
2016-08-16 15:16:04 +00:00
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
auto mem_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
|
|
|
ret_status = DeleteImpl(column_family_id, begin_key, end_key,
|
|
|
|
kTypeRangeDeletion, &mem_kv_prot_info);
|
|
|
|
} else {
|
|
|
|
ret_status = DeleteImpl(column_family_id, begin_key, end_key,
|
|
|
|
kTypeRangeDeletion, nullptr /* kv_prot_info */);
|
|
|
|
}
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
2020-11-24 00:27:46 +00:00
|
|
|
// If `ret_status` is `TryAgain` then the next (successful) try will add
|
|
|
|
// the key to the rebuilding transaction object. If `ret_status` is
|
|
|
|
// another non-OK `Status`, then the `rebuilding_trx_` will be thrown
|
|
|
|
// away. So we only need to add to it when `ret_status.ok()`.
|
2018-03-05 18:48:29 +00:00
|
|
|
if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
|
|
|
|
assert(!write_after_commit_);
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
ret_status = WriteBatchInternal::DeleteRange(
|
|
|
|
rebuilding_trx_, column_family_id, begin_key, end_key);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
2018-03-05 18:48:29 +00:00
|
|
|
return ret_status;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
}
|
|
|
|
|
2019-02-19 21:36:04 +00:00
|
|
|
Status MergeCF(uint32_t column_family_id, const Slice& key,
|
|
|
|
const Slice& value) override {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const auto* kv_prot_info = NextProtectionInfo();
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
|
|
|
if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
|
|
|
|
value);
|
2017-09-28 23:43:04 +00:00
|
|
|
// else insert the values to the memtable right away
|
2016-05-05 19:48:52 +00:00
|
|
|
}
|
|
|
|
|
2020-08-24 23:41:42 +00:00
|
|
|
Status ret_status;
|
|
|
|
if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok() && rebuilding_trx_ != nullptr) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2018-03-08 18:18:34 +00:00
|
|
|
// The CF is probably flushed and hence no need for insert but we still
|
2018-03-05 18:48:29 +00:00
|
|
|
// need to keep track of the keys for upcoming rollback/commit.
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
ret_status = WriteBatchInternal::Merge(rebuilding_trx_,
|
|
|
|
column_family_id, key, value);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
|
|
|
|
}
|
|
|
|
} else if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq(false /* batch_boundary */);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
2020-08-24 23:41:42 +00:00
|
|
|
return ret_status;
|
2014-01-28 19:05:04 +00:00
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
assert(ret_status.ok());
|
2016-05-05 19:48:52 +00:00
|
|
|
|
2014-02-06 00:02:48 +00:00
|
|
|
MemTable* mem = cf_mems_->GetMemTable();
|
2017-11-03 05:16:23 +00:00
|
|
|
auto* moptions = mem->GetImmutableMemTableOptions();
|
2020-11-17 01:58:59 +00:00
|
|
|
if (moptions->merge_operator == nullptr) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Merge requires `ColumnFamilyOptions::merge_operator != nullptr`");
|
|
|
|
}
|
2014-01-11 01:33:56 +00:00
|
|
|
bool perform_merge = false;
|
2019-05-14 00:43:47 +00:00
|
|
|
assert(!concurrent_memtable_writes_ ||
|
|
|
|
moptions->max_successive_merges == 0);
|
2014-01-11 01:33:56 +00:00
|
|
|
|
2016-09-21 18:05:07 +00:00
|
|
|
// If we pass DB through and options.max_successive_merges is hit
|
|
|
|
// during recovery, Get() will be issued which will try to acquire
|
|
|
|
// DB mutex and cause deadlock, as DB mutex is already held.
|
|
|
|
// So we disable merge in recovery
|
|
|
|
if (moptions->max_successive_merges > 0 && db_ != nullptr &&
|
|
|
|
recovering_log_number_ == 0) {
|
2019-05-14 00:43:47 +00:00
|
|
|
assert(!concurrent_memtable_writes_);
|
2014-01-11 01:33:56 +00:00
|
|
|
LookupKey lkey(key, sequence_);
|
|
|
|
|
|
|
|
// Count the number of successive merges at the head
|
2024-04-17 19:11:24 +00:00
|
|
|
// of the key in the memtable. Limit the count to the threshold for
|
|
|
|
// triggering merge to prevent unnecessary counting overhead.
|
|
|
|
size_t num_merges = mem->CountSuccessiveMergeEntries(
|
|
|
|
lkey, moptions->max_successive_merges /* limit */);
|
2014-01-11 01:33:56 +00:00
|
|
|
|
2014-09-09 01:46:52 +00:00
|
|
|
if (num_merges >= moptions->max_successive_merges) {
|
2014-01-11 01:33:56 +00:00
|
|
|
perform_merge = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (perform_merge) {
|
2023-10-02 23:25:25 +00:00
|
|
|
// 1) Get the existing value. Use the wide column APIs to make sure we
|
|
|
|
// don't lose any columns in the process.
|
|
|
|
PinnableWideColumns existing;
|
2014-01-11 01:33:56 +00:00
|
|
|
|
|
|
|
// Pass in the sequence number so that we also include previous merge
|
|
|
|
// operations in the same batch.
|
|
|
|
SnapshotImpl read_from_snapshot;
|
|
|
|
read_from_snapshot.number_ = sequence_;
|
2023-10-02 23:25:25 +00:00
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
// TODO: plumb Env::IOActivity, Env::IOPriority
|
2014-01-11 01:33:56 +00:00
|
|
|
ReadOptions read_options;
|
2024-02-21 21:15:27 +00:00
|
|
|
if (!moptions->strict_max_successive_merges) {
|
|
|
|
// Blocking the write path with read I/O is typically unacceptable, so
|
|
|
|
// only do this merge when the operands are all found in memory.
|
|
|
|
read_options.read_tier = kBlockCacheTier;
|
|
|
|
}
|
2014-01-11 01:33:56 +00:00
|
|
|
read_options.snapshot = &read_from_snapshot;
|
|
|
|
|
2014-02-11 01:04:44 +00:00
|
|
|
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
|
|
|
|
if (cf_handle == nullptr) {
|
|
|
|
cf_handle = db_->DefaultColumnFamily();
|
|
|
|
}
|
2023-10-02 23:25:25 +00:00
|
|
|
|
|
|
|
Status get_status =
|
|
|
|
db_->GetEntity(read_options, cf_handle, key, &existing);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (!get_status.ok()) {
|
|
|
|
// Failed to read a key we know exists. Store the delta in memtable.
|
2014-07-28 19:05:36 +00:00
|
|
|
perform_merge = false;
|
2014-01-11 01:33:56 +00:00
|
|
|
} else {
|
2020-11-24 00:27:46 +00:00
|
|
|
// 2) Apply this merge
|
|
|
|
auto merge_operator = moptions->merge_operator;
|
|
|
|
assert(merge_operator);
|
|
|
|
|
2023-10-02 23:25:25 +00:00
|
|
|
const auto& columns = existing.columns();
|
|
|
|
|
|
|
|
Status merge_status;
|
2020-11-24 00:27:46 +00:00
|
|
|
std::string new_value;
|
Integrate FullMergeV3 into the query and compaction paths (#11858)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11858
The patch builds on https://github.com/facebook/rocksdb/pull/11807 and integrates the `FullMergeV3` API into the read and compaction code paths by updating and extending the logic in `MergeHelper`.
In particular, when it comes to merge inputs, the existing `TimedFullMergeWithEntity` is folded into `TimedFullMerge`, since wide-column base values are now handled the same way as plain base values (or no base values for that matter), e.g. they are passed directly to the `MergeOperator`. On the other hand, there is some new differentiation on the output side. Namely, there are now two sets of `TimedFullMerge` variants: one set for contexts where the complete merge result and its value type are needed (used by iterators and compactions), and another set where the merge result is needed in a form determined by the client (used by the point lookup APIs, where e.g. for `Get` we have to extract the value of the default column of any wide-column results).
Implementation-wise, the two sets of overloads use different visitors to process the `std::variant` produced by `FullMergeV3`. This has the benefit of eliminating some repeated code e.g. in the point lookup paths, since `TimedFullMerge` now populates the application's result object (`PinnableSlice`/`string` or `PinnableWideColumns`) directly. Moreover, within each set of variants, there is a separate overload for the no base value/plain base value/wide-column base value cases, which eliminates some repeated branching w/r/t to the type of the base value if any.
Reviewed By: jaykorean
Differential Revision: D49352562
fbshipit-source-id: c2fb9853dba3fbbc6918665bde4195c4ea150a0c
2023-09-20 00:27:04 +00:00
|
|
|
ValueType new_value_type;
|
2023-10-02 23:25:25 +00:00
|
|
|
|
|
|
|
if (WideColumnsHelper::HasDefaultColumnOnly(columns)) {
|
|
|
|
// `op_failure_scope` (an output parameter) is not provided (set to
|
|
|
|
// nullptr) since a failure must be propagated regardless of its
|
|
|
|
// value.
|
|
|
|
merge_status = MergeHelper::TimedFullMerge(
|
|
|
|
merge_operator, key, MergeHelper::kPlainBaseValue,
|
|
|
|
WideColumnsHelper::GetDefaultColumn(columns), {value},
|
|
|
|
moptions->info_log, moptions->statistics,
|
|
|
|
SystemClock::Default().get(),
|
2023-12-05 22:07:42 +00:00
|
|
|
/* update_num_ops_stats */ false, /* op_failure_scope */ nullptr,
|
|
|
|
&new_value, /* result_operand */ nullptr, &new_value_type);
|
2023-10-02 23:25:25 +00:00
|
|
|
} else {
|
|
|
|
// `op_failure_scope` (an output parameter) is not provided (set to
|
|
|
|
// nullptr) since a failure must be propagated regardless of its
|
|
|
|
// value.
|
|
|
|
merge_status = MergeHelper::TimedFullMerge(
|
|
|
|
merge_operator, key, MergeHelper::kWideBaseValue, columns,
|
|
|
|
{value}, moptions->info_log, moptions->statistics,
|
|
|
|
SystemClock::Default().get(),
|
2023-12-05 22:07:42 +00:00
|
|
|
/* update_num_ops_stats */ false, /* op_failure_scope */ nullptr,
|
|
|
|
&new_value, /* result_operand */ nullptr, &new_value_type);
|
2023-10-02 23:25:25 +00:00
|
|
|
}
|
2020-11-24 00:27:46 +00:00
|
|
|
|
|
|
|
if (!merge_status.ok()) {
|
|
|
|
// Failed to merge!
|
|
|
|
// Store the delta in memtable
|
|
|
|
perform_merge = false;
|
|
|
|
} else {
|
|
|
|
// 3) Add value to memtable
|
|
|
|
assert(!concurrent_memtable_writes_);
|
2023-10-02 23:25:25 +00:00
|
|
|
assert(new_value_type == kTypeValue ||
|
|
|
|
new_value_type == kTypeWideColumnEntity);
|
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
auto merged_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
|
|
|
merged_kv_prot_info.UpdateV(value, new_value);
|
Integrate FullMergeV3 into the query and compaction paths (#11858)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11858
The patch builds on https://github.com/facebook/rocksdb/pull/11807 and integrates the `FullMergeV3` API into the read and compaction code paths by updating and extending the logic in `MergeHelper`.
In particular, when it comes to merge inputs, the existing `TimedFullMergeWithEntity` is folded into `TimedFullMerge`, since wide-column base values are now handled the same way as plain base values (or no base values for that matter), e.g. they are passed directly to the `MergeOperator`. On the other hand, there is some new differentiation on the output side. Namely, there are now two sets of `TimedFullMerge` variants: one set for contexts where the complete merge result and its value type are needed (used by iterators and compactions), and another set where the merge result is needed in a form determined by the client (used by the point lookup APIs, where e.g. for `Get` we have to extract the value of the default column of any wide-column results).
Implementation-wise, the two sets of overloads use different visitors to process the `std::variant` produced by `FullMergeV3`. This has the benefit of eliminating some repeated code e.g. in the point lookup paths, since `TimedFullMerge` now populates the application's result object (`PinnableSlice`/`string` or `PinnableWideColumns`) directly. Moreover, within each set of variants, there is a separate overload for the no base value/plain base value/wide-column base value cases, which eliminates some repeated branching w/r/t to the type of the base value if any.
Reviewed By: jaykorean
Differential Revision: D49352562
fbshipit-source-id: c2fb9853dba3fbbc6918665bde4195c4ea150a0c
2023-09-20 00:27:04 +00:00
|
|
|
merged_kv_prot_info.UpdateO(kTypeMerge, new_value_type);
|
|
|
|
ret_status = mem->Add(sequence_, new_value_type, key, new_value,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
&merged_kv_prot_info);
|
|
|
|
} else {
|
Integrate FullMergeV3 into the query and compaction paths (#11858)
Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11858
The patch builds on https://github.com/facebook/rocksdb/pull/11807 and integrates the `FullMergeV3` API into the read and compaction code paths by updating and extending the logic in `MergeHelper`.
In particular, when it comes to merge inputs, the existing `TimedFullMergeWithEntity` is folded into `TimedFullMerge`, since wide-column base values are now handled the same way as plain base values (or no base values for that matter), e.g. they are passed directly to the `MergeOperator`. On the other hand, there is some new differentiation on the output side. Namely, there are now two sets of `TimedFullMerge` variants: one set for contexts where the complete merge result and its value type are needed (used by iterators and compactions), and another set where the merge result is needed in a form determined by the client (used by the point lookup APIs, where e.g. for `Get` we have to extract the value of the default column of any wide-column results).
Implementation-wise, the two sets of overloads use different visitors to process the `std::variant` produced by `FullMergeV3`. This has the benefit of eliminating some repeated code e.g. in the point lookup paths, since `TimedFullMerge` now populates the application's result object (`PinnableSlice`/`string` or `PinnableWideColumns`) directly. Moreover, within each set of variants, there is a separate overload for the no base value/plain base value/wide-column base value cases, which eliminates some repeated branching w/r/t to the type of the base value if any.
Reviewed By: jaykorean
Differential Revision: D49352562
fbshipit-source-id: c2fb9853dba3fbbc6918665bde4195c4ea150a0c
2023-09-20 00:27:04 +00:00
|
|
|
ret_status = mem->Add(sequence_, new_value_type, key, new_value,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
nullptr /* kv_prot_info */);
|
|
|
|
}
|
2018-02-06 02:32:54 +00:00
|
|
|
}
|
2014-01-11 01:33:56 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!perform_merge) {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
assert(ret_status.ok());
|
2020-11-24 00:27:46 +00:00
|
|
|
// Add merge operand to memtable
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
auto mem_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
|
|
|
ret_status =
|
|
|
|
mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info,
|
|
|
|
concurrent_memtable_writes_, get_post_process_info(mem));
|
|
|
|
} else {
|
|
|
|
ret_status = mem->Add(
|
|
|
|
sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */,
|
|
|
|
concurrent_memtable_writes_, get_post_process_info(mem));
|
|
|
|
}
|
2014-01-11 01:33:56 +00:00
|
|
|
}
|
|
|
|
|
2020-11-24 00:27:46 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
assert(seq_per_batch_);
|
|
|
|
const bool kBatchBoundary = true;
|
|
|
|
MaybeAdvanceSeq(kBatchBoundary);
|
|
|
|
} else if (ret_status.ok()) {
|
|
|
|
MaybeAdvanceSeq();
|
|
|
|
CheckMemtableFull();
|
|
|
|
}
|
2018-03-05 18:48:29 +00:00
|
|
|
// optimize for non-recovery mode
|
2020-11-24 00:27:46 +00:00
|
|
|
// If `ret_status` is `TryAgain` then the next (successful) try will add
|
|
|
|
// the key to the rebuilding transaction object. If `ret_status` is
|
|
|
|
// another non-OK `Status`, then the `rebuilding_trx_` will be thrown
|
|
|
|
// away. So we only need to add to it when `ret_status.ok()`.
|
|
|
|
if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
|
2018-03-05 18:48:29 +00:00
|
|
|
assert(!write_after_commit_);
|
2021-09-14 20:13:36 +00:00
|
|
|
// TODO(ajkr): propagate `ProtectionInfoKVOS64`.
|
2020-10-20 20:17:17 +00:00
|
|
|
ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id,
|
|
|
|
key, value);
|
2018-03-05 18:48:29 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
2018-02-06 02:32:54 +00:00
|
|
|
return ret_status;
|
2013-03-21 22:59:47 +00:00
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
|
2019-02-19 21:36:04 +00:00
|
|
|
Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
|
|
|
|
const Slice& value) override {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
const auto* kv_prot_info = NextProtectionInfo();
|
2022-04-28 21:42:00 +00:00
|
|
|
Status ret_status;
|
2024-05-22 00:22:20 +00:00
|
|
|
|
2024-05-22 18:06:52 +00:00
|
|
|
auto rebuild_txn_op = [](WriteBatch* /* rebuilding_trx */,
|
|
|
|
uint32_t /* cf_id */, const Slice& /* k */,
|
|
|
|
const Slice& /* v */) -> Status {
|
2024-05-22 00:22:20 +00:00
|
|
|
return Status::NotSupported();
|
|
|
|
};
|
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
if (kv_prot_info != nullptr) {
|
|
|
|
// Memtable needs seqno, doesn't need CF ID
|
|
|
|
auto mem_kv_prot_info =
|
|
|
|
kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
|
|
|
|
// Same as PutCF except for value type.
|
2022-04-28 21:42:00 +00:00
|
|
|
ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
|
2024-05-22 18:06:52 +00:00
|
|
|
rebuild_txn_op, &mem_kv_prot_info);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
} else {
|
2022-04-28 21:42:00 +00:00
|
|
|
ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
|
2024-05-22 18:06:52 +00:00
|
|
|
rebuild_txn_op, nullptr /* kv_prot_info */);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(ret_status.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
|
|
|
return ret_status;
|
2017-10-03 16:08:07 +00:00
|
|
|
}
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
void CheckMemtableFull() {
|
|
|
|
if (flush_scheduler_ != nullptr) {
|
|
|
|
auto* cfd = cf_mems_->current();
|
|
|
|
assert(cfd != nullptr);
|
|
|
|
if (cfd->mem()->ShouldScheduleFlush() &&
|
|
|
|
cfd->mem()->MarkFlushScheduled()) {
|
|
|
|
// MarkFlushScheduled only returns true if we are the one that
|
|
|
|
// should take action, so no need to dedup further
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
flush_scheduler_->ScheduleWork(cfd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// check if memtable_list size exceeds max_write_buffer_size_to_maintain
|
|
|
|
if (trim_history_scheduler_ != nullptr) {
|
|
|
|
auto* cfd = cf_mems_->current();
|
2019-12-14 03:09:53 +00:00
|
|
|
|
|
|
|
assert(cfd);
|
|
|
|
assert(cfd->ioptions());
|
|
|
|
|
|
|
|
const size_t size_to_maintain = static_cast<size_t>(
|
|
|
|
cfd->ioptions()->max_write_buffer_size_to_maintain);
|
|
|
|
|
|
|
|
if (size_to_maintain > 0) {
|
|
|
|
MemTableList* const imm = cfd->imm();
|
|
|
|
assert(imm);
|
|
|
|
|
2019-12-16 21:13:42 +00:00
|
|
|
if (imm->HasHistory()) {
|
2019-12-14 03:09:53 +00:00
|
|
|
const MemTable* const mem = cfd->mem();
|
|
|
|
assert(mem);
|
|
|
|
|
2021-12-02 19:44:29 +00:00
|
|
|
if (mem->MemoryAllocatedBytes() +
|
|
|
|
imm->MemoryAllocatedBytesExcludingLast() >=
|
2019-12-14 03:09:53 +00:00
|
|
|
size_to_maintain &&
|
|
|
|
imm->MarkTrimHistoryNeeded()) {
|
|
|
|
trim_history_scheduler_->ScheduleWork(cfd);
|
|
|
|
}
|
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2018-07-07 00:17:36 +00:00
|
|
|
// The write batch handler calls MarkBeginPrepare with unprepare set to true
|
|
|
|
// if it encounters the kTypeBeginUnprepareXID marker.
|
|
|
|
Status MarkBeginPrepare(bool unprepare) override {
|
2016-04-18 18:11:51 +00:00
|
|
|
assert(rebuilding_trx_ == nullptr);
|
|
|
|
assert(db_);
|
|
|
|
|
|
|
|
if (recovering_log_number_ != 0) {
|
2022-03-17 02:00:04 +00:00
|
|
|
db_->mutex()->AssertHeld();
|
2016-04-18 18:11:51 +00:00
|
|
|
// during recovery we rebuild a hollow transaction
|
|
|
|
// from all encountered prepare sections of the wal
|
|
|
|
if (db_->allow_2pc() == false) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"WAL contains prepared transactions. Open with "
|
|
|
|
"TransactionDB::Open().");
|
|
|
|
}
|
|
|
|
|
|
|
|
// we are now iterating through a prepared section
|
|
|
|
rebuilding_trx_ = new WriteBatch();
|
2017-09-28 23:43:04 +00:00
|
|
|
rebuilding_trx_seq_ = sequence_;
|
2020-02-14 02:50:01 +00:00
|
|
|
// Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers.
|
|
|
|
// unprepared_batch_ should be false because it is false by default, and
|
|
|
|
// gets reset to false in MarkEndPrepare.
|
2018-07-07 00:17:36 +00:00
|
|
|
assert(!unprepared_batch_);
|
|
|
|
unprepared_batch_ = unprepare;
|
|
|
|
|
Ignore stale logs while restarting DBs
Summary:
Stale log files can be deleted out of order. This can happen for various reasons. One of the reason is that no data is ever inserted to a column family and we have an optimization to update its log number, but not all the old log files are cleaned up (the case shown in the unit tests added). It can also happen when we simply delete multiple log files out of order.
This causes data corruption because we simply increase seqID after processing the next row and we may end up with writing data with smaller seqID than what is already flushed to memtables.
In DB recovery, for the oldest files we are replaying, if there it contains no data for any column family, we ignore the sequence IDs in the file.
Test Plan: Add two unit tests that fail without the fix.
Reviewers: IslamAbdelRahman, igor, yiwu
Reviewed By: yiwu
Subscribers: hermanlee4, yoshinorim, leveldb, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D60891
2016-07-19 18:48:00 +00:00
|
|
|
if (has_valid_writes_ != nullptr) {
|
|
|
|
*has_valid_writes_ = true;
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkEndPrepare(const Slice& name) override {
|
|
|
|
assert(db_);
|
|
|
|
assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0));
|
|
|
|
|
|
|
|
if (recovering_log_number_ != 0) {
|
2022-03-17 02:00:04 +00:00
|
|
|
db_->mutex()->AssertHeld();
|
2016-04-18 18:11:51 +00:00
|
|
|
assert(db_->allow_2pc());
|
2018-03-05 18:48:29 +00:00
|
|
|
size_t batch_cnt =
|
|
|
|
write_after_commit_
|
|
|
|
? 0 // 0 will disable further checks
|
|
|
|
: static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
|
2016-04-18 18:11:51 +00:00
|
|
|
db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
|
2018-03-05 18:48:29 +00:00
|
|
|
rebuilding_trx_, rebuilding_trx_seq_,
|
2018-07-07 00:17:36 +00:00
|
|
|
batch_cnt, unprepared_batch_);
|
2020-02-14 02:50:01 +00:00
|
|
|
unprepared_batch_ = false;
|
2016-04-18 18:11:51 +00:00
|
|
|
rebuilding_trx_ = nullptr;
|
|
|
|
} else {
|
|
|
|
assert(rebuilding_trx_ == nullptr);
|
|
|
|
}
|
2017-09-18 21:36:53 +00:00
|
|
|
const bool batch_boundry = true;
|
|
|
|
MaybeAdvanceSeq(batch_boundry);
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2017-09-28 23:43:04 +00:00
|
|
|
Status MarkNoop(bool empty_batch) override {
|
2022-03-17 02:00:04 +00:00
|
|
|
if (recovering_log_number_ != 0) {
|
|
|
|
db_->mutex()->AssertHeld();
|
|
|
|
}
|
2017-09-18 21:36:53 +00:00
|
|
|
// A hack in pessimistic transaction could result into a noop at the start
|
|
|
|
// of the write batch, that should be ignored.
|
2017-09-28 23:43:04 +00:00
|
|
|
if (!empty_batch) {
|
2017-09-18 21:36:53 +00:00
|
|
|
// In the absence of Prepare markers, a kTypeNoop tag indicates the end of
|
|
|
|
// a batch. This happens when write batch commits skipping the prepare
|
|
|
|
// phase.
|
|
|
|
const bool batch_boundry = true;
|
|
|
|
MaybeAdvanceSeq(batch_boundry);
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkCommit(const Slice& name) override {
|
|
|
|
assert(db_);
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
if (recovering_log_number_ != 0) {
|
2021-12-10 19:03:39 +00:00
|
|
|
// We must hold db mutex in recovery.
|
|
|
|
db_->mutex()->AssertHeld();
|
2016-04-18 18:11:51 +00:00
|
|
|
// in recovery when we encounter a commit marker
|
|
|
|
// we lookup this transaction in our set of rebuilt transactions
|
|
|
|
// and commit.
|
|
|
|
auto trx = db_->GetRecoveredTransaction(name.ToString());
|
|
|
|
|
2018-03-08 18:18:34 +00:00
|
|
|
// the log containing the prepared section may have
|
2016-04-18 18:11:51 +00:00
|
|
|
// been released in the last incarnation because the
|
|
|
|
// data was flushed to L0
|
|
|
|
if (trx != nullptr) {
|
|
|
|
// at this point individual CF lognumbers will prevent
|
|
|
|
// duplicate re-insertion of values.
|
|
|
|
assert(log_number_ref_ == 0);
|
2017-09-28 23:43:04 +00:00
|
|
|
if (write_after_commit_) {
|
2018-07-07 00:17:36 +00:00
|
|
|
// write_after_commit_ can only have one batch in trx.
|
|
|
|
assert(trx->batches_.size() == 1);
|
|
|
|
const auto& batch_info = trx->batches_.begin()->second;
|
2018-03-08 18:18:34 +00:00
|
|
|
// all inserts must reference this trx log number
|
2018-07-07 00:17:36 +00:00
|
|
|
log_number_ref_ = batch_info.log_number_;
|
2022-04-28 21:42:00 +00:00
|
|
|
ResetProtectionInfo();
|
2018-07-07 00:17:36 +00:00
|
|
|
s = batch_info.batch_->Iterate(this);
|
2017-09-28 23:43:04 +00:00
|
|
|
log_number_ref_ = 0;
|
|
|
|
}
|
|
|
|
// else the values are already inserted before the commit
|
2016-04-18 18:11:51 +00:00
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
db_->DeleteRecoveredTransaction(name.ToString());
|
|
|
|
}
|
Ignore stale logs while restarting DBs
Summary:
Stale log files can be deleted out of order. This can happen for various reasons. One of the reason is that no data is ever inserted to a column family and we have an optimization to update its log number, but not all the old log files are cleaned up (the case shown in the unit tests added). It can also happen when we simply delete multiple log files out of order.
This causes data corruption because we simply increase seqID after processing the next row and we may end up with writing data with smaller seqID than what is already flushed to memtables.
In DB recovery, for the oldest files we are replaying, if there it contains no data for any column family, we ignore the sequence IDs in the file.
Test Plan: Add two unit tests that fail without the fix.
Reviewers: IslamAbdelRahman, igor, yiwu
Reviewed By: yiwu
Subscribers: hermanlee4, yoshinorim, leveldb, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D60891
2016-07-19 18:48:00 +00:00
|
|
|
if (has_valid_writes_ != nullptr) {
|
|
|
|
*has_valid_writes_ = true;
|
|
|
|
}
|
2016-04-18 18:11:51 +00:00
|
|
|
}
|
|
|
|
} else {
|
2017-09-28 23:43:04 +00:00
|
|
|
// When writes are not delayed until commit, there is no disconnect
|
|
|
|
// between a memtable write and the WAL that supports it. So the commit
|
|
|
|
// need not reference any log as the only log to which it depends.
|
|
|
|
assert(!write_after_commit_ || log_number_ref_ > 0);
|
2016-04-18 18:11:51 +00:00
|
|
|
}
|
2017-09-18 21:36:53 +00:00
|
|
|
const bool batch_boundry = true;
|
|
|
|
MaybeAdvanceSeq(batch_boundry);
|
2016-04-18 18:11:51 +00:00
|
|
|
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(s.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
|
|
|
|
2016-04-18 18:11:51 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2021-12-10 19:03:39 +00:00
|
|
|
Status MarkCommitWithTimestamp(const Slice& name,
|
|
|
|
const Slice& commit_ts) override {
|
|
|
|
assert(db_);
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
if (recovering_log_number_ != 0) {
|
|
|
|
// In recovery, db mutex must be held.
|
|
|
|
db_->mutex()->AssertHeld();
|
|
|
|
// in recovery when we encounter a commit marker
|
|
|
|
// we lookup this transaction in our set of rebuilt transactions
|
|
|
|
// and commit.
|
|
|
|
auto trx = db_->GetRecoveredTransaction(name.ToString());
|
|
|
|
// the log containing the prepared section may have
|
|
|
|
// been released in the last incarnation because the
|
|
|
|
// data was flushed to L0
|
|
|
|
if (trx) {
|
|
|
|
// at this point individual CF lognumbers will prevent
|
|
|
|
// duplicate re-insertion of values.
|
|
|
|
assert(0 == log_number_ref_);
|
|
|
|
if (write_after_commit_) {
|
|
|
|
// write_after_commit_ can only have one batch in trx.
|
|
|
|
assert(trx->batches_.size() == 1);
|
|
|
|
const auto& batch_info = trx->batches_.begin()->second;
|
|
|
|
// all inserts must reference this trx log number
|
|
|
|
log_number_ref_ = batch_info.log_number_;
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
|
|
|
|
s = batch_info.batch_->UpdateTimestamps(
|
|
|
|
commit_ts, [this](uint32_t cf) {
|
|
|
|
assert(db_);
|
|
|
|
VersionSet* const vset = db_->GetVersionSet();
|
|
|
|
assert(vset);
|
|
|
|
ColumnFamilySet* const cf_set = vset->GetColumnFamilySet();
|
|
|
|
assert(cf_set);
|
|
|
|
ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf);
|
|
|
|
assert(cfd);
|
|
|
|
const auto* const ucmp = cfd->user_comparator();
|
|
|
|
assert(ucmp);
|
|
|
|
return ucmp->timestamp_size();
|
|
|
|
});
|
2021-12-10 19:03:39 +00:00
|
|
|
if (s.ok()) {
|
2022-04-28 21:42:00 +00:00
|
|
|
ResetProtectionInfo();
|
2021-12-10 19:03:39 +00:00
|
|
|
s = batch_info.batch_->Iterate(this);
|
|
|
|
log_number_ref_ = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// else the values are already inserted before the commit
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
db_->DeleteRecoveredTransaction(name.ToString());
|
|
|
|
}
|
|
|
|
if (has_valid_writes_) {
|
|
|
|
*has_valid_writes_ = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// When writes are not delayed until commit, there is no connection
|
|
|
|
// between a memtable write and the WAL that supports it. So the commit
|
|
|
|
// need not reference any log as the only log to which it depends.
|
|
|
|
assert(!write_after_commit_ || log_number_ref_ > 0);
|
|
|
|
}
|
|
|
|
constexpr bool batch_boundary = true;
|
|
|
|
MaybeAdvanceSeq(batch_boundary);
|
|
|
|
|
2022-04-28 21:42:00 +00:00
|
|
|
if (UNLIKELY(s.IsTryAgain())) {
|
|
|
|
DecrementProtectionInfoIdxForTryAgain();
|
|
|
|
}
|
|
|
|
|
2021-12-10 19:03:39 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2016-04-18 18:11:51 +00:00
|
|
|
Status MarkRollback(const Slice& name) override {
|
|
|
|
assert(db_);
|
|
|
|
|
|
|
|
if (recovering_log_number_ != 0) {
|
|
|
|
auto trx = db_->GetRecoveredTransaction(name.ToString());
|
|
|
|
|
|
|
|
// the log containing the transactions prep section
|
|
|
|
// may have been released in the previous incarnation
|
|
|
|
// because we knew it had been rolled back
|
|
|
|
if (trx != nullptr) {
|
|
|
|
db_->DeleteRecoveredTransaction(name.ToString());
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// in non recovery we simply ignore this tag
|
|
|
|
}
|
|
|
|
|
2017-11-15 16:19:57 +00:00
|
|
|
const bool batch_boundry = true;
|
|
|
|
MaybeAdvanceSeq(batch_boundry);
|
|
|
|
|
2016-04-18 18:11:51 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
2016-07-07 21:45:29 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
MemTablePostProcessInfo* get_post_process_info(MemTable* mem) {
|
|
|
|
if (!concurrent_memtable_writes_) {
|
|
|
|
// No need to batch counters locally if we don't use concurrent mode.
|
|
|
|
return nullptr;
|
|
|
|
}
|
2017-03-22 18:07:52 +00:00
|
|
|
return &GetPostMap()[mem];
|
2016-07-07 21:45:29 +00:00
|
|
|
}
|
2011-05-21 02:17:43 +00:00
|
|
|
};
|
|
|
|
|
2022-11-02 21:34:24 +00:00
|
|
|
} // anonymous namespace
|
2022-04-28 21:42:00 +00:00
|
|
|
|
2015-01-06 20:44:21 +00:00
|
|
|
// This function can only be called in these conditions:
|
|
|
|
// 1) During Recovery()
|
2015-11-06 15:29:10 +00:00
|
|
|
// 2) During Write(), in a single-threaded write thread
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
// 3) During Write(), in a concurrent context where memtables has been cloned
|
2015-11-06 15:29:10 +00:00
|
|
|
// The reason is that it calls memtables->Seek(), which has a stateful cache
|
2017-09-18 21:36:53 +00:00
|
|
|
Status WriteBatchInternal::InsertInto(
|
|
|
|
WriteThread::WriteGroup& write_group, SequenceNumber sequence,
|
|
|
|
ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
TrimHistoryScheduler* trim_history_scheduler,
|
2017-09-18 21:36:53 +00:00
|
|
|
bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
|
2018-06-29 01:46:39 +00:00
|
|
|
bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
|
|
|
|
MemTableInserter inserter(
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
sequence, memtables, flush_scheduler, trim_history_scheduler,
|
|
|
|
ignore_missing_column_families, recovery_log_number, db,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
concurrent_memtable_writes, nullptr /* prot_info */,
|
|
|
|
nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn);
|
2017-05-19 21:24:23 +00:00
|
|
|
for (auto w : write_group) {
|
2017-11-15 16:22:54 +00:00
|
|
|
if (w->CallbackFailed()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
w->sequence = inserter.sequence();
|
2016-04-18 18:11:51 +00:00
|
|
|
if (!w->ShouldWriteToMemtable()) {
|
2017-11-15 16:22:54 +00:00
|
|
|
// In seq_per_batch_ mode this advances the seq by one.
|
2017-10-06 21:18:30 +00:00
|
|
|
inserter.MaybeAdvanceSeq(true);
|
2016-04-18 18:11:51 +00:00
|
|
|
continue;
|
|
|
|
}
|
2017-05-31 17:45:47 +00:00
|
|
|
SetSequence(w->batch, inserter.sequence());
|
2016-04-18 18:11:51 +00:00
|
|
|
inserter.set_log_number_ref(w->log_ref);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
inserter.set_prot_info(w->batch->prot_info_.get());
|
2016-04-18 18:11:51 +00:00
|
|
|
w->status = w->batch->Iterate(&inserter);
|
|
|
|
if (!w->status.ok()) {
|
|
|
|
return w->status;
|
2016-02-05 18:44:13 +00:00
|
|
|
}
|
2018-02-06 02:32:54 +00:00
|
|
|
assert(!seq_per_batch || w->batch_cnt != 0);
|
|
|
|
assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt);
|
2015-11-06 15:29:10 +00:00
|
|
|
}
|
2016-02-05 18:44:13 +00:00
|
|
|
return Status::OK();
|
2015-11-06 15:29:10 +00:00
|
|
|
}
|
|
|
|
|
2017-10-06 21:18:30 +00:00
|
|
|
Status WriteBatchInternal::InsertInto(
|
|
|
|
WriteThread::Writer* writer, SequenceNumber sequence,
|
|
|
|
ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
TrimHistoryScheduler* trim_history_scheduler,
|
2017-10-06 21:18:30 +00:00
|
|
|
bool ignore_missing_column_families, uint64_t log_number, DB* db,
|
2018-06-29 01:46:39 +00:00
|
|
|
bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt,
|
2019-09-12 23:53:31 +00:00
|
|
|
bool batch_per_txn, bool hint_per_batch) {
|
2018-04-13 00:55:14 +00:00
|
|
|
#ifdef NDEBUG
|
|
|
|
(void)batch_cnt;
|
|
|
|
#endif
|
2017-05-31 17:45:47 +00:00
|
|
|
assert(writer->ShouldWriteToMemtable());
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
MemTableInserter inserter(sequence, memtables, flush_scheduler,
|
|
|
|
trim_history_scheduler,
|
|
|
|
ignore_missing_column_families, log_number, db,
|
|
|
|
concurrent_memtable_writes, nullptr /* prot_info */,
|
|
|
|
nullptr /*has_valid_writes*/, seq_per_batch,
|
|
|
|
batch_per_txn, hint_per_batch);
|
2017-05-31 17:45:47 +00:00
|
|
|
SetSequence(writer->batch, sequence);
|
2016-04-18 18:11:51 +00:00
|
|
|
inserter.set_log_number_ref(writer->log_ref);
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
inserter.set_prot_info(writer->batch->prot_info_.get());
|
2016-07-07 21:45:29 +00:00
|
|
|
Status s = writer->batch->Iterate(&inserter);
|
2018-02-06 02:32:54 +00:00
|
|
|
assert(!seq_per_batch || batch_cnt != 0);
|
|
|
|
assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt);
|
2016-07-07 21:45:29 +00:00
|
|
|
if (concurrent_memtable_writes) {
|
|
|
|
inserter.PostProcess();
|
|
|
|
}
|
|
|
|
return s;
|
2016-04-18 18:11:51 +00:00
|
|
|
}
|
|
|
|
|
Ignore stale logs while restarting DBs
Summary:
Stale log files can be deleted out of order. This can happen for various reasons. One of the reason is that no data is ever inserted to a column family and we have an optimization to update its log number, but not all the old log files are cleaned up (the case shown in the unit tests added). It can also happen when we simply delete multiple log files out of order.
This causes data corruption because we simply increase seqID after processing the next row and we may end up with writing data with smaller seqID than what is already flushed to memtables.
In DB recovery, for the oldest files we are replaying, if there it contains no data for any column family, we ignore the sequence IDs in the file.
Test Plan: Add two unit tests that fail without the fix.
Reviewers: IslamAbdelRahman, igor, yiwu
Reviewed By: yiwu
Subscribers: hermanlee4, yoshinorim, leveldb, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D60891
2016-07-19 18:48:00 +00:00
|
|
|
Status WriteBatchInternal::InsertInto(
|
|
|
|
const WriteBatch* batch, ColumnFamilyMemTables* memtables,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
FlushScheduler* flush_scheduler,
|
|
|
|
TrimHistoryScheduler* trim_history_scheduler,
|
|
|
|
bool ignore_missing_column_families, uint64_t log_number, DB* db,
|
|
|
|
bool concurrent_memtable_writes, SequenceNumber* next_seq,
|
|
|
|
bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) {
|
2017-05-31 17:45:47 +00:00
|
|
|
MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
trim_history_scheduler,
|
2017-05-31 17:45:47 +00:00
|
|
|
ignore_missing_column_families, log_number, db,
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
concurrent_memtable_writes, batch->prot_info_.get(),
|
|
|
|
has_valid_writes, seq_per_batch, batch_per_txn);
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
2016-05-04 21:02:27 +00:00
|
|
|
Status s = batch->Iterate(&inserter);
|
2017-09-18 21:36:53 +00:00
|
|
|
if (next_seq != nullptr) {
|
|
|
|
*next_seq = inserter.sequence();
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
2016-05-04 21:02:27 +00:00
|
|
|
}
|
2016-07-07 21:45:29 +00:00
|
|
|
if (concurrent_memtable_writes) {
|
|
|
|
inserter.PostProcess();
|
|
|
|
}
|
[rocksdb] Recovery path sequence miscount fix
Summary:
Consider the following WAL with 4 batch entries prefixed with their sequence at time of memtable insert.
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(a)]
[1: BEGIN_PREPARE, PUT, PUT, PUT, PUT, END_PREPARE(b)]
[4: COMMIT(a)]
[7: COMMIT(b)]
The first two batches do not consume any sequence numbers so are both prefixed with seq=1.
For 2pc commit, memtable insertion takes place before COMMIT batch is written to WAL.
We can see that sequence number consumption takes place between WAL entries giving us the seemingly sparse sequence prefix for WAL entries.
This is a valid WAL.
Because with 2PC markers one WriteBatch points to another batch containing its inserts a writebatch can consume more or less sequence numbers than the number of sequence consuming entries that it contains.
We can see that, given the entries in the WAL, 6 sequence ids were consumed. Yet on recovery the maximum sequence consumed would be 7 + 3 (the number of sequence numbers consumed by COMMIT(b))
So, now upon recovery we must track the actual consumption of sequence numbers.
In the provided scenario there will be no sequence gaps, but it is possible to produce a sequence gap. This should not be a problem though. correct?
Test Plan: provided test.
Reviewers: sdong
Subscribers: andrewkr, leveldb, dhruba, hermanlee4
Differential Revision: https://reviews.facebook.net/D57645
2016-05-04 21:02:27 +00:00
|
|
|
return s;
|
2014-01-28 19:05:04 +00:00
|
|
|
}
|
|
|
|
|
2022-04-28 21:42:00 +00:00
|
|
|
namespace {
|
|
|
|
|
|
|
|
// This class updates protection info for a WriteBatch.
|
|
|
|
class ProtectionInfoUpdater : public WriteBatch::Handler {
|
|
|
|
public:
|
|
|
|
explicit ProtectionInfoUpdater(WriteBatch::ProtectionInfo* prot_info)
|
|
|
|
: prot_info_(prot_info) {}
|
|
|
|
|
2023-12-04 19:17:32 +00:00
|
|
|
~ProtectionInfoUpdater() override = default;
|
2022-04-28 21:42:00 +00:00
|
|
|
|
|
|
|
Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override {
|
|
|
|
return UpdateProtInfo(cf, key, val, kTypeValue);
|
|
|
|
}
|
|
|
|
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
Status TimedPutCF(uint32_t cf, const Slice& key, const Slice& val,
|
2024-04-30 22:40:35 +00:00
|
|
|
uint64_t unix_write_time) override {
|
|
|
|
std::string encoded_write_time;
|
|
|
|
PutFixed64(&encoded_write_time, unix_write_time);
|
|
|
|
std::array<Slice, 2> value_with_time{{val, encoded_write_time}};
|
|
|
|
SliceParts packed_value(value_with_time.data(), 2);
|
|
|
|
return UpdateProtInfo(cf, key, packed_value, kTypeValuePreferredSeqno);
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
}
|
|
|
|
|
2022-06-25 22:30:47 +00:00
|
|
|
Status PutEntityCF(uint32_t cf, const Slice& key,
|
|
|
|
const Slice& entity) override {
|
|
|
|
return UpdateProtInfo(cf, key, entity, kTypeWideColumnEntity);
|
|
|
|
}
|
|
|
|
|
2022-04-28 21:42:00 +00:00
|
|
|
Status DeleteCF(uint32_t cf, const Slice& key) override {
|
|
|
|
return UpdateProtInfo(cf, key, "", kTypeDeletion);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
|
|
|
|
return UpdateProtInfo(cf, key, "", kTypeSingleDeletion);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
|
|
|
|
const Slice& end_key) override {
|
|
|
|
return UpdateProtInfo(cf, begin_key, end_key, kTypeRangeDeletion);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override {
|
|
|
|
return UpdateProtInfo(cf, key, val, kTypeMerge);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status PutBlobIndexCF(uint32_t cf, const Slice& key,
|
|
|
|
const Slice& val) override {
|
|
|
|
return UpdateProtInfo(cf, key, val, kTypeBlobIndex);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkBeginPrepare(bool /* unprepare */) override {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkEndPrepare(const Slice& /* xid */) override {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkCommit(const Slice& /* xid */) override { return Status::OK(); }
|
|
|
|
|
|
|
|
Status MarkCommitWithTimestamp(const Slice& /* xid */,
|
|
|
|
const Slice& /* ts */) override {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status MarkRollback(const Slice& /* xid */) override { return Status::OK(); }
|
|
|
|
|
|
|
|
Status MarkNoop(bool /* empty_batch */) override { return Status::OK(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
Status UpdateProtInfo(uint32_t cf, const Slice& key, const Slice& val,
|
|
|
|
const ValueType op_type) {
|
|
|
|
if (prot_info_) {
|
|
|
|
prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64().ProtectKVO(key, val, op_type).ProtectC(cf));
|
|
|
|
}
|
|
|
|
return Status::OK();
|
2024-04-30 22:40:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Status UpdateProtInfo(uint32_t cf, const Slice& key, const SliceParts& val,
|
|
|
|
const ValueType op_type) {
|
|
|
|
if (prot_info_) {
|
|
|
|
prot_info_->entries_.emplace_back(
|
|
|
|
ProtectionInfo64()
|
|
|
|
.ProtectKVO(SliceParts(&key, 1), val, op_type)
|
|
|
|
.ProtectC(cf));
|
|
|
|
}
|
|
|
|
return Status::OK();
|
2022-04-28 21:42:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// No copy or move.
|
|
|
|
ProtectionInfoUpdater(const ProtectionInfoUpdater&) = delete;
|
|
|
|
ProtectionInfoUpdater(ProtectionInfoUpdater&&) = delete;
|
|
|
|
ProtectionInfoUpdater& operator=(const ProtectionInfoUpdater&) = delete;
|
|
|
|
ProtectionInfoUpdater& operator=(ProtectionInfoUpdater&&) = delete;
|
|
|
|
|
|
|
|
WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
|
|
|
|
};
|
|
|
|
|
2022-11-02 21:34:24 +00:00
|
|
|
} // anonymous namespace
|
2022-04-28 21:42:00 +00:00
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
|
2016-03-30 17:43:00 +00:00
|
|
|
assert(contents.size() >= WriteBatchInternal::kHeader);
|
2022-06-17 06:10:07 +00:00
|
|
|
assert(b->prot_info_ == nullptr);
|
2022-04-28 21:42:00 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
b->rep_.assign(contents.data(), contents.size());
|
2015-11-06 15:03:30 +00:00
|
|
|
b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed);
|
2017-04-10 22:38:34 +00:00
|
|
|
return Status::OK();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 22:38:34 +00:00
|
|
|
Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
|
|
|
|
const bool wal_only) {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
assert(dst->Count() == 0 ||
|
|
|
|
(dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr));
|
2022-06-15 20:43:58 +00:00
|
|
|
if ((src->prot_info_ != nullptr &&
|
|
|
|
src->prot_info_->entries_.size() != src->Count()) ||
|
|
|
|
(dst->prot_info_ != nullptr &&
|
|
|
|
dst->prot_info_->entries_.size() != dst->Count())) {
|
|
|
|
return Status::Corruption(
|
|
|
|
"Write batch has inconsistent count and number of checksums");
|
|
|
|
}
|
|
|
|
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
size_t src_len;
|
|
|
|
int src_count;
|
|
|
|
uint32_t src_flags;
|
|
|
|
|
|
|
|
const SavePoint& batch_end = src->GetWalTerminationPoint();
|
|
|
|
|
|
|
|
if (wal_only && !batch_end.is_cleared()) {
|
|
|
|
src_len = batch_end.size - WriteBatchInternal::kHeader;
|
|
|
|
src_count = batch_end.count;
|
|
|
|
src_flags = batch_end.content_flags;
|
|
|
|
} else {
|
|
|
|
src_len = src->rep_.size() - WriteBatchInternal::kHeader;
|
|
|
|
src_count = Count(src);
|
|
|
|
src_flags = src->content_flags_.load(std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
|
2022-06-18 22:12:17 +00:00
|
|
|
if (src->prot_info_ != nullptr) {
|
|
|
|
if (dst->prot_info_ == nullptr) {
|
|
|
|
dst->prot_info_.reset(new WriteBatch::ProtectionInfo());
|
|
|
|
}
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
std::copy(src->prot_info_->entries_.begin(),
|
|
|
|
src->prot_info_->entries_.begin() + src_count,
|
|
|
|
std::back_inserter(dst->prot_info_->entries_));
|
2022-06-18 22:12:17 +00:00
|
|
|
} else if (dst->prot_info_ != nullptr) {
|
|
|
|
// dst has empty prot_info->entries
|
|
|
|
// In this special case, we allow write batch without prot_info to
|
|
|
|
// be appende to write batch with empty prot_info
|
|
|
|
dst->prot_info_ = nullptr;
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
}
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
SetCount(dst, Count(dst) + src_count);
|
2016-03-30 17:43:00 +00:00
|
|
|
assert(src->rep_.size() >= WriteBatchInternal::kHeader);
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
|
2015-11-06 15:03:30 +00:00
|
|
|
dst->content_flags_.store(
|
Add facility to write only a portion of WriteBatch to WAL
Summary:
When constructing a write batch a client may now call MarkWalTerminationPoint() on that batch. No batch operations after this call will be added written to the WAL but will still be inserted into the Memtable. This facility is used to remove one of the three WriteImpl calls in 2PC transactions. This produces a ~1% perf improvement.
```
RocksDB - unoptimized 2pc, sync_binlog=1, disable_2pc=off
INFO 2016-08-31 14:30:38,814 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2619 seconds. Requests/second = 28628
RocksDB - optimized 2pc , sync_binlog=1, disable_2pc=off
INFO 2016-08-31 16:26:59,442 [main]: REQUEST PHASE COMPLETED. 75000000 requests done in 2581 seconds. Requests/second = 29054
```
Test Plan: Two unit tests added.
Reviewers: sdong, yiwu, IslamAbdelRahman
Reviewed By: yiwu
Subscribers: hermanlee4, dhruba, andrewkr
Differential Revision: https://reviews.facebook.net/D64599
2016-10-07 18:31:26 +00:00
|
|
|
dst->content_flags_.load(std::memory_order_relaxed) | src_flags,
|
2015-11-06 15:03:30 +00:00
|
|
|
std::memory_order_relaxed);
|
2017-04-10 22:38:34 +00:00
|
|
|
return Status::OK();
|
2012-03-09 00:23:21 +00:00
|
|
|
}
|
|
|
|
|
2015-11-06 15:29:10 +00:00
|
|
|
size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
|
|
|
|
size_t rightByteSize) {
|
|
|
|
if (leftByteSize == 0 || rightByteSize == 0) {
|
|
|
|
return leftByteSize + rightByteSize;
|
|
|
|
} else {
|
2016-03-30 17:43:00 +00:00
|
|
|
return leftByteSize + rightByteSize - WriteBatchInternal::kHeader;
|
2015-11-06 15:29:10 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-17 06:10:07 +00:00
|
|
|
Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
|
2022-07-05 22:44:35 +00:00
|
|
|
size_t bytes_per_key,
|
|
|
|
uint64_t* checksum) {
|
2022-06-17 06:10:07 +00:00
|
|
|
if (bytes_per_key == 0) {
|
|
|
|
if (wb->prot_info_ != nullptr) {
|
|
|
|
wb->prot_info_.reset();
|
|
|
|
return Status::OK();
|
|
|
|
} else {
|
|
|
|
// Already not protected.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
} else if (bytes_per_key == 8) {
|
|
|
|
if (wb->prot_info_ == nullptr) {
|
|
|
|
wb->prot_info_.reset(new WriteBatch::ProtectionInfo());
|
|
|
|
ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get());
|
2022-07-05 22:44:35 +00:00
|
|
|
Status s = wb->Iterate(&prot_info_updater);
|
|
|
|
if (s.ok() && checksum != nullptr) {
|
|
|
|
uint64_t expected_hash = XXH3_64bits(wb->rep_.data(), wb->rep_.size());
|
|
|
|
if (expected_hash != *checksum) {
|
|
|
|
return Status::Corruption("Write batch content corrupted.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
2022-06-17 06:10:07 +00:00
|
|
|
} else {
|
|
|
|
// Already protected.
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::NotSupported(
|
|
|
|
"WriteBatch protection info must be zero or eight bytes/key");
|
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|