2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2013-10-05 05:32:05 +00:00
|
|
|
#pragma once
|
2011-03-18 22:37:00 +00:00
|
|
|
#include <stdio.h>
|
2021-09-29 11:01:57 +00:00
|
|
|
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queried, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
#include <memory>
|
2024-02-07 02:35:36 +00:00
|
|
|
#include <optional>
|
2014-07-23 19:31:11 +00:00
|
|
|
#include <string>
|
2016-08-16 15:16:04 +00:00
|
|
|
#include <utility>
|
2021-09-29 11:01:57 +00:00
|
|
|
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/comparator.h"
|
|
|
|
#include "rocksdb/slice.h"
|
2014-04-10 21:19:43 +00:00
|
|
|
#include "rocksdb/slice_transform.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/types.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "util/coding.h"
|
2019-03-27 17:24:16 +00:00
|
|
|
#include "util/user_comparator_wrapper.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2019-05-23 23:16:38 +00:00
|
|
|
// The file declares data structures and functions that deal with internal
|
|
|
|
// keys.
|
|
|
|
// Each internal key contains a user key, a sequence number (SequenceNumber)
|
|
|
|
// and a type (ValueType), and they are usually encoded together.
|
|
|
|
// There are some related helper classes here.
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
class InternalKey;
|
|
|
|
|
|
|
|
// Tag stored as the last component of an internal key.
//
// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk data
// structures.
//
// The highest bit of the value type is reserved for SST tables so that they
// can do more flexible encoding.
enum ValueType : unsigned char {
  kTypeDeletion = 0x00,
  kTypeValue = 0x01,
  kTypeMerge = 0x02,
  kTypeLogData = 0x03,               // WAL only.
  kTypeColumnFamilyDeletion = 0x04,  // WAL only.
  kTypeColumnFamilyValue = 0x05,     // WAL only.
  kTypeColumnFamilyMerge = 0x06,     // WAL only.
  kTypeSingleDeletion = 0x07,
  kTypeColumnFamilySingleDeletion = 0x08,  // WAL only.
  kTypeBeginPrepareXID = 0x09,             // WAL only.
  kTypeEndPrepareXID = 0x0A,               // WAL only.
  kTypeCommitXID = 0x0B,                   // WAL only.
  kTypeRollbackXID = 0x0C,                 // WAL only.
  kTypeNoop = 0x0D,                        // WAL only.
  kTypeColumnFamilyRangeDeletion = 0x0E,   // WAL only.
  kTypeRangeDeletion = 0x0F,               // meta block
  kTypeColumnFamilyBlobIndex = 0x10,       // Blob DB only
  kTypeBlobIndex = 0x11,                   // Blob DB only

  // A distinct record is used when the prepared record is also persisted in
  // the db. This ensures that a WAL generated by one WritePolicy is not
  // mistakenly read by another, which would result in data inconsistency.
  kTypeBeginPersistedPrepareXID = 0x12,  // WAL only.

  // Same idea as kTypeBeginPersistedPrepareXID: ensures that a WAL generated
  // by the WriteUnprepared write policy is not mistakenly read by another.
  kTypeBeginUnprepareXID = 0x13,  // WAL only.

  kTypeDeletionWithTimestamp = 0x14,
  kTypeCommitXIDAndTimestamp = 0x15,  // WAL only
  kTypeWideColumnEntity = 0x16,
  kTypeColumnFamilyWideColumnEntity = 0x17,     // WAL only
  kTypeValuePreferredSeqno = 0x18,              // Value with a unix write time
  kTypeColumnFamilyValuePreferredSeqno = 0x19,  // WAL only

  // Must stay immediately after the last valid type; only used for
  // validation. Its value is implicit (previous enumerator + 1).
  kTypeMaxValid,

  kMaxValue = 0x7F  // Not used for storing records.
};
|
2014-01-27 21:53:22 +00:00
|
|
|
|
2016-09-28 01:20:57 +00:00
|
|
|
// Defined in dbformat.cc
|
|
|
|
extern const ValueType kValueTypeForSeek;
|
|
|
|
extern const ValueType kValueTypeForSeekForPrev;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
|
2024-02-07 02:35:36 +00:00
|
|
|
// A range of user keys used internally by RocksDB. Also see `Range` used by
|
|
|
|
// public APIs.
|
|
|
|
struct UserKeyRange {
|
|
|
|
// In case of user_defined timestamp, if enabled, `start` and `limit` should
|
|
|
|
// include user_defined timestamps.
|
|
|
|
Slice start;
|
|
|
|
Slice limit;
|
|
|
|
|
|
|
|
UserKeyRange() = default;
|
|
|
|
UserKeyRange(const Slice& s, const Slice& l) : start(s), limit(l) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
// A range of user keys used internally by RocksDB. Also see `RangePtr` used by
|
|
|
|
// public APIs.
|
|
|
|
struct UserKeyRangePtr {
|
|
|
|
// In case of user_defined timestamp, if enabled, `start` and `limit` should
|
|
|
|
// point to key with timestamp part.
|
|
|
|
// An optional range start, if missing, indicating a start before all keys.
|
|
|
|
std::optional<Slice> start;
|
|
|
|
// An optional range end, if missing, indicating an end after all keys.
|
|
|
|
std::optional<Slice> limit;
|
|
|
|
|
|
|
|
UserKeyRangePtr(const std::optional<Slice>& s, const std::optional<Slice>& l)
|
|
|
|
: start(s), limit(l) {}
|
|
|
|
};
|
|
|
|
|
2016-08-16 15:16:04 +00:00
|
|
|
// Checks whether a type is an inline value type
|
|
|
|
// (i.e. a type used in memtable skiplist and sst file datablock).
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
inline bool IsValueType(ValueType t) {
|
2022-06-21 01:04:08 +00:00
|
|
|
return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t ||
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t ||
|
|
|
|
kTypeValuePreferredSeqno == t;
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2016-08-16 15:16:04 +00:00
|
|
|
// Checks whether a type is from user operation
|
|
|
|
// kTypeRangeDeletion is in meta block so this API is separated from above
|
2023-02-22 20:28:18 +00:00
|
|
|
// kTypeMaxValid can be from keys generated by
|
|
|
|
// TruncatedRangeDelIterator::start_key()
|
2016-08-16 15:16:04 +00:00
|
|
|
inline bool IsExtendedValueType(ValueType t) {
|
2023-02-22 20:28:18 +00:00
|
|
|
return IsValueType(t) || t == kTypeRangeDeletion || t == kTypeMaxValid;
|
2016-08-16 15:16:04 +00:00
|
|
|
}
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// We leave eight bits empty at the bottom so a type and sequence#
|
|
|
|
// can be packed together into 64-bits.
|
2019-03-27 23:13:08 +00:00
|
|
|
static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2022-05-05 20:08:21 +00:00
|
|
|
static const SequenceNumber kDisableGlobalSequenceNumber =
|
|
|
|
std::numeric_limits<uint64_t>::max();
|
2016-10-18 23:59:37 +00:00
|
|
|
|
2020-10-01 17:08:52 +00:00
|
|
|
constexpr uint64_t kNumInternalBytes = 8;
|
|
|
|
|
2021-11-10 18:47:53 +00:00
|
|
|
// Defined in dbformat.cc
|
|
|
|
extern const std::string kDisableUserTimestamp;
|
|
|
|
|
2019-05-23 23:16:38 +00:00
|
|
|
// The data structure that represents an internal key in the way that user_key,
|
|
|
|
// sequence number and type are stored in separated forms.
|
2011-03-18 22:37:00 +00:00
|
|
|
struct ParsedInternalKey {
|
|
|
|
Slice user_key;
|
|
|
|
SequenceNumber sequence;
|
|
|
|
ValueType type;
|
|
|
|
|
2017-06-28 22:36:11 +00:00
|
|
|
ParsedInternalKey()
|
2020-10-01 02:15:42 +00:00
|
|
|
: sequence(kMaxSequenceNumber),
|
|
|
|
type(kTypeDeletion) // Make code analyzer happy
|
|
|
|
{} // Intentionally left uninitialized (for speed)
|
2020-03-07 00:21:03 +00:00
|
|
|
// u contains timestamp if user timestamp feature is enabled.
|
2011-03-18 22:37:00 +00:00
|
|
|
ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
|
2019-03-27 23:13:08 +00:00
|
|
|
: user_key(u), sequence(seq), type(t) {}
|
2020-10-28 17:11:13 +00:00
|
|
|
std::string DebugString(bool log_err_key, bool hex) const;
|
2017-07-24 18:28:20 +00:00
|
|
|
|
|
|
|
void clear() {
|
|
|
|
user_key.clear();
|
|
|
|
sequence = 0;
|
|
|
|
type = kTypeDeletion;
|
|
|
|
}
|
Allow compaction iterator to perform garbage collection (#7556)
Summary:
Add a threshold timestamp, full_history_ts_low_ of type `std::string*` to
`CompactionIterator`, so that RocksDB can also perform garbage collection during
compaction.
* If full_history_ts_low_ is nullptr, then compaction iterator does not perform
GC, preserving all timestamp history for all keys. Compaction iterator will
treat user key with different timestamps as different user keys.
* If full_history_ts_low_ is not nullptr, then compaction iterator performs
GC. GC will look at keys older than `*full_history_ts_low_` and determine their
eligibility based on factors including snapshots.
Current rules of GC:
* If an internal key is in the same snapshot as a previous counterpart
with the same user key, and this key is eligible for GC, and the key is
not single-delete or merge operand, then this key can be dropped. Note
that the previous internal key cannot be a merge operand either.
* If a tombstone is the most recent one in the earliest snapshot and it
is eligible for GC, and keyNotExistsBeyondLevel() is true, then this
tombstone can be dropped.
* If a tombstone is the most recent one in a snapshot and it is eligible
for GC, and the compaction is at bottommost level, then all other older
internal keys of the same user key must also be eligible for GC, thus
can be dropped
* Single-delete, delete-range and merge are not currently supported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7556
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D24507728
Pulled By: riversand963
fbshipit-source-id: 3c09c7301f41eed76dfcf4d1527e68cf6e0a8bb3
2020-10-24 05:58:05 +00:00
|
|
|
|
|
|
|
void SetTimestamp(const Slice& ts) {
|
|
|
|
assert(ts.size() <= user_key.size());
|
2021-03-10 19:13:55 +00:00
|
|
|
const char* addr = user_key.data() + user_key.size() - ts.size();
|
Allow compaction iterator to perform garbage collection (#7556)
Summary:
Add a threshold timestamp, full_history_ts_low_ of type `std::string*` to
`CompactionIterator`, so that RocksDB can also perform garbage collection during
compaction.
* If full_history_ts_low_ is nullptr, then compaction iterator does not perform
GC, preserving all timestamp history for all keys. Compaction iterator will
treat user key with different timestamps as different user keys.
* If full_history_ts_low_ is not nullptr, then compaction iterator performs
GC. GC will look at keys older than `*full_history_ts_low_` and determine their
eligibility based on factors including snapshots.
Current rules of GC:
* If an internal key is in the same snapshot as a previous counterpart
with the same user key, and this key is eligible for GC, and the key is
not single-delete or merge operand, then this key can be dropped. Note
that the previous internal key cannot be a merge operand either.
* If a tombstone is the most recent one in the earliest snapshot and it
is eligible for GC, and keyNotExistsBeyondLevel() is true, then this
tombstone can be dropped.
* If a tombstone is the most recent one in a snapshot and it is eligible
for GC, and the compaction is at bottommost level, then all other older
internal keys of the same user key must also be eligible for GC, thus
can be dropped
* Single-delete, delete-range and merge are not currently supported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7556
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D24507728
Pulled By: riversand963
fbshipit-source-id: 3c09c7301f41eed76dfcf4d1527e68cf6e0a8bb3
2020-10-24 05:58:05 +00:00
|
|
|
memcpy(const_cast<char*>(addr), ts.data(), ts.size());
|
|
|
|
}
|
2022-11-01 05:28:58 +00:00
|
|
|
|
|
|
|
Slice GetTimestamp(size_t ts_sz) {
|
|
|
|
assert(ts_sz <= user_key.size());
|
|
|
|
const char* addr = user_key.data() + user_key.size() - ts_sz;
|
|
|
|
return Slice(const_cast<char*>(addr), ts_sz);
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Return the length of the encoding of "key".
|
|
|
|
inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
|
2020-10-01 17:08:52 +00:00
|
|
|
return key.user_key.size() + kNumInternalBytes;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
// Pack a sequence number and a ValueType into a single uint64_t: the result is
// `seq` shifted left by 8 bits, OR'ed with `t`. In builds with asserts enabled,
// `seq` must not exceed kMaxSequenceNumber and `t` must either satisfy
// IsExtendedValueType() or be kTypeMaxValid.
|
2020-07-08 00:25:08 +00:00
|
|
|
inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
|
|
|
|
assert(seq <= kMaxSequenceNumber);
|
Skip swaths of range tombstone covered keys in merging iterator (2022 edition) (#10449)
Summary:
Delete range logic is moved from `DBIter` to `MergingIterator`, and `MergingIterator` will seek to the end of a range deletion if possible instead of scanning through each key and check with `RangeDelAggregator`.
With the invariant that a key in level L (consider memtable as the first level, each immutable and L0 as a separate level) has a larger sequence number than all keys in any level >L, a range tombstone `[start, end)` from level L covers all keys in its range in any level >L. This property motivates optimizations in iterator:
- in `Seek(target)`, if level L has a range tombstone `[start, end)` that covers `target.UserKey`, then for all levels > L, we can do Seek() on `end` instead of `target` to skip some range tombstone covered keys.
- in `Next()/Prev()`, if the current key is covered by a range tombstone `[start, end)` from level L, we can do `Seek` to `end` for all levels > L.
This PR implements the above optimizations in `MergingIterator`. As all range tombstone covered keys are now skipped in `MergingIterator`, the range tombstone logic is removed from `DBIter`. The idea in this PR is similar to https://github.com/facebook/rocksdb/issues/7317, but this PR leaves `InternalIterator` interface mostly unchanged. **Credit**: the cascading seek optimization and the sentinel key (discussed below) are inspired by [Pebble](https://github.com/cockroachdb/pebble/blob/master/merging_iter.go) and suggested by ajkr in https://github.com/facebook/rocksdb/issues/7317. The two optimizations are mostly implemented in `SeekImpl()/SeekForPrevImpl()` and `IsNextDeleted()/IsPrevDeleted()` in `merging_iterator.cc`. See comments for each method for more detail.
One notable change is that the minHeap/maxHeap used by `MergingIterator` now contains range tombstone end keys besides point key iterators. This helps to reduce the number of key comparisons. For example, for a range tombstone `[start, end)`, a `start` and an `end` `HeapItem` are inserted into the heap. When a `HeapItem` for range tombstone start key is popped from the minHeap, we know this range tombstone becomes "active" in the sense that, before the range tombstone's end key is popped from the minHeap, all the keys popped from this heap is covered by the range tombstone's internal key range `[start, end)`.
Another major change, *delete range sentinel key*, is made to `LevelIterator`. Before this PR, when all point keys in an SST file are iterated through in `MergingIterator`, a level iterator would advance to the next SST file in its level. In the case when an SST file has a range tombstone that covers keys beyond the SST file's last point key, advancing to the next SST file would lose this range tombstone. Consequently, `MergingIterator` could return keys that should have been deleted by some range tombstone. We prevent this by pretending that file boundaries in each SST file are sentinel keys. A `LevelIterator` now only advance the file iterator once the sentinel key is processed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10449
Test Plan:
- Added many unit tests in db_range_del_test
- Stress test: `./db_stress --readpercent=5 --prefixpercent=19 --writepercent=20 -delpercent=10 --iterpercent=44 --delrangepercent=2`
- Additional iterator stress test is added to verify against iterators against expected state: https://github.com/facebook/rocksdb/issues/10538. This is based on ajkr's previous attempt https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913.
```
python3 ./tools/db_crashtest.py blackbox --simple --write_buffer_size=524288 --target_file_size_base=524288 --max_bytes_for_level_base=2097152 --compression_type=none --max_background_compactions=8 --value_size_mult=33 --max_key=5000000 --interval=10 --duration=7200 --delrangepercent=3 --delpercent=9 --iterpercent=25 --writepercent=60 --readpercent=3 --prefixpercent=0 --num_iterations=1000 --range_deletion_width=100 --verify_iterator_with_expected_state_one_in=1
```
- Performance benchmark: I used a similar setup as in the blog [post](http://rocksdb.org/blog/2018/11/21/delete-range.html) that introduced DeleteRange, "a database with 5 million data keys, and 10000 range tombstones (ignoring those dropped during compaction) that were written in regular intervals after 4.5 million data keys were written". As expected, the performance with this PR depends on the range tombstone width.
```
# Setup:
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=fillrandom --writes=4500000 --num=5000000
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=overwrite --writes=500000 --num=5000000 --use_existing_db=true --writes_per_range_tombstone=50
# Scan entire DB
TEST_TMPDIR=/dev/shm ./db_bench_main --benchmarks=readseq[-X5] --use_existing_db=true --num=5000000 --disable_auto_compactions=true
# Short range scan (10 Next())
TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=100000 --seek_nexts=10 --disable_auto_compactions=true
# Long range scan(1000 Next())
TEST_TMPDIR=/dev/shm/width-100/ ./db_bench_main --benchmarks=seekrandom[-X5] --use_existing_db=true --num=500000 --reads=2500 --seek_nexts=1000 --disable_auto_compactions=true
```
Avg over of 10 runs (some slower tests had fews runs):
For the first column (tombstone), 0 means no range tombstone, 100-10000 means width of the 10k range tombstones, and 1 means there is a single range tombstone in the entire DB (width is 1000). The 1 tombstone case is to test regression when there's very few range tombstones in the DB, as no range tombstone is likely to take a different code path than with range tombstones.
- Scan entire DB
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |2525600 (± 43564) |2486917 (± 33698) |-1.53% |
| 100 |1853835 (± 24736) |2073884 (± 32176) |+11.87% |
| 1000 |422415 (± 7466) |1115801 (± 22781) |+164.15% |
| 10000 |22384 (± 227) |227919 (± 6647) |+918.22% |
| 1 range tombstone |2176540 (± 39050) |2434954 (± 24563) |+11.87% |
- Short range scan
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |35398 (± 533) |35338 (± 569) |-0.17% |
| 100 |28276 (± 664) |31684 (± 331) |+12.05% |
| 1000 |7637 (± 77) |25422 (± 277) |+232.88% |
| 10000 |1367 |28667 |+1997.07% |
| 1 range tombstone |32618 (± 581) |32748 (± 506) |+0.4% |
- Long range scan
| tombstone width | Pre-PR ops/sec | Post-PR ops/sec | ±% |
| ------------- | ------------- | ------------- | ------------- |
| 0 range tombstone |2262 (± 33) |2353 (± 20) |+4.02% |
| 100 |1696 (± 26) |1926 (± 18) |+13.56% |
| 1000 |410 (± 6) |1255 (± 29) |+206.1% |
| 10000 |25 |414 |+1556.0% |
| 1 range tombstone |1957 (± 30) |2185 (± 44) |+11.65% |
- Microbench does not show significant regression: https://gist.github.com/cbi42/59f280f85a59b678e7e5d8561e693b61
Reviewed By: ajkr
Differential Revision: D38450331
Pulled By: cbi42
fbshipit-source-id: b5ef12e8d8c289ed2e163ccdf277f5039b511fca
2022-09-02 16:51:19 +00:00
|
|
|
// kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor.
|
|
|
|
assert(IsExtendedValueType(t) || t == kTypeMaxValid);
|
2020-07-08 00:25:08 +00:00
|
|
|
return (seq << 8) | t;
|
|
|
|
}
|
2014-04-01 21:45:30 +00:00
|
|
|
|
2015-05-29 21:36:35 +00:00
|
|
|
// Inverse of PackSequenceAndType: given its result in `packed`, store the
// sequence number (`packed >> 8`) in *seq
|
|
|
|
// and the ValueType (the low 8 bits of `packed`) in *t. No validation is
// performed on either output; the checks below are intentionally disabled.
|
2020-07-08 00:25:08 +00:00
|
|
|
inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq,
|
|
|
|
ValueType* t) {
|
|
|
|
*seq = packed >> 8;
|
|
|
|
*t = static_cast<ValueType>(packed & 0xff);
|
|
|
|
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
// Commented the following two assertions in order to test key-value checksum
|
|
|
|
// on corrupted keys without crashing ("DbKvChecksumTest").
|
|
|
|
// assert(*seq <= kMaxSequenceNumber);
|
|
|
|
// assert(IsExtendedValueType(*t));
|
2020-07-08 00:25:08 +00:00
|
|
|
}
|
2015-05-29 21:36:35 +00:00
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
// Packed (sequence number, type) value obtained by combining
// kMaxSequenceNumber with kTypeRangeDeletion via PackSequenceAndType.
const uint64_t kRangeTombstoneSentinel =
|
|
|
|
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
|
|
|
|
|
2017-11-02 05:52:17 +00:00
|
|
|
// Converts an internal ValueType into an EntryType. Declaration only; the
// mapping itself is defined outside this part of the file.
EntryType GetEntryType(ValueType value_type);
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Append the serialization of "key" to *result.
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [internal key]: <user_key | seqno + type>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_key | seqno + type>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests, the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy, units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
// Serializes `key` as <user_key | seqno + type> onto the end of *result.
// Declaration only; the definition is outside this part of the file.
void AppendInternalKey(std::string* result, const ParsedInternalKey& key);
|
2020-03-07 00:21:03 +00:00
|
|
|
|
|
|
|
// Append the serialization of "key" to *result, replacing the original
|
|
|
|
// timestamp with argument ts.
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [internal key]: <user_provided_key | original_ts | seqno + type>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | ts | seqno + type>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests, the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy, units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
// `ts` holds the replacement timestamp bytes written in place of the timestamp
// carried by the user key of `key`.
void AppendInternalKeyWithDifferentTimestamp(std::string* result,
|
|
|
|
const ParsedInternalKey& key,
|
|
|
|
const Slice& ts);
|
2020-03-07 00:21:03 +00:00
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
// Append the user key to *result, replacing the original timestamp with
|
|
|
|
// argument ts.
|
|
|
|
//
|
|
|
|
// input [user key]: <user_provided_key | original_ts>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | ts>
|
|
|
|
// `key` is a user key that already ends in a timestamp; `ts` is written in its
// place when the key is appended to *result.
void AppendUserKeyWithDifferentTimestamp(std::string* result, const Slice& key,
|
|
|
|
const Slice& ts);
|
|
|
|
|
2016-12-01 15:00:17 +00:00
|
|
|
// Serialized internal key consists of user key followed by footer.
|
|
|
|
// This function appends the footer to *result, assuming that *result already
|
|
|
|
// contains the user key at the end.
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// output before: <user_key>
|
|
|
|
// output after: <user_key | seqno + type>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests, the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy, units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
// `s` and `t` are the sequence number and value type that form the
// <seqno + type> footer appended after the user key already held in *result.
void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
|
|
|
|
ValueType t);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-12-02 20:59:23 +00:00
|
|
|
// Append the key and a minimal timestamp to *result
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [user key without ts]: <user_provided_key>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | min_ts>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests, the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy, units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
// `key` carries no timestamp on input. `ts_sz` is presumably the byte length
// of the minimum timestamp to append -- confirm against the definition.
void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
|
|
|
|
size_t ts_sz);
|
2020-12-02 20:59:23 +00:00
|
|
|
|
|
|
|
// Append the key and a maximal timestamp to *result
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [user key without ts]: <user_provided_key>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | max_ts>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests, the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy, units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
// `key` carries no timestamp on input. `ts_sz` is presumably the byte length
// of the maximal timestamp to append -- confirm against the definition.
void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
|
|
|
|
size_t ts_sz);
|
2020-12-02 20:59:23 +00:00
|
|
|
|
2023-08-15 05:04:18 +00:00
|
|
|
// `key` is a user key with timestamp. Append the user key without timestamp
|
|
|
|
// and the minimum timestamp to *result.
|
|
|
|
//
|
|
|
|
// input [user key]: <user_provided_key | original_ts>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | min_ts>
|
|
|
|
// `key` ends in a timestamp on input. `ts_sz` is presumably the byte length of
// that timestamp and of the minimum timestamp written in its place -- confirm
// against the definition.
void AppendUserKeyWithMinTimestamp(std::string* result, const Slice& key,
|
|
|
|
size_t ts_sz);
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
// `key` is a user key with timestamp. Append the user key without timestamp
|
|
|
|
// and the maximal timestamp to *result.
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [user key]: <user_provided_key | original_ts>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | max_ts>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests, the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy, units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
// `key` ends in a timestamp on input. `ts_sz` is presumably the byte length of
// that timestamp and of the maximal timestamp written in its place -- confirm
// against the definition.
void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
|
|
|
|
size_t ts_sz);
|
2022-09-30 23:13:03 +00:00
|
|
|
|
2023-05-25 22:41:32 +00:00
|
|
|
// `key` is an internal key containing a user key without timestamp. Create a
|
|
|
|
// new key in *result by padding a min timestamp of size `ts_sz` to the user key
|
|
|
|
// and copying the remaining internal key bytes.
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [internal key]: <user_provided_key | seqno + type>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | min_ts | seqno + type>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping the timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard-coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When the user-defined timestamp is stripped during block building, on writing the first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance-critical path, I did the following benchmark to check that no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy; units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
|
|
|
|
size_t ts_sz);
|
2023-05-25 22:41:32 +00:00
|
|
|
|
2024-01-29 19:37:34 +00:00
|
|
|
// `key` is an internal key containing a user key without timestamp. Create a
|
|
|
|
// new key in *result by padding a max timestamp of size `ts_sz` to the user key
|
|
|
|
// and copying the remaining internal key bytes.
|
|
|
|
//
|
|
|
|
// input [internal key]: <user_provided_key | seqno + type>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | max_ts | seqno + type>
|
|
|
|
void PadInternalKeyWithMaxTimestamp(std::string* result, const Slice& key,
|
|
|
|
size_t ts_sz);
|
|
|
|
|
2023-05-25 22:41:32 +00:00
|
|
|
// `key` is an internal key containing a user key with timestamp of size
|
|
|
|
// `ts_sz`. Create a new internal key in *result by stripping the timestamp from
|
|
|
|
// the user key and copying the remaining internal key bytes.
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [internal key]: <user_provided_key | original_ts | seqno + type>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | seqno + type>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping the timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard-coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When the user-defined timestamp is stripped during block building, on writing the first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance-critical path, I did the following benchmark to check that no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy; units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
void StripTimestampFromInternalKey(std::string* result, const Slice& key,
|
|
|
|
size_t ts_sz);
|
|
|
|
|
|
|
|
// `key` is an internal key containing a user key with timestamp of size
|
|
|
|
// `ts_sz`. Create a new internal key in *result while replacing the original
|
|
|
|
// timestamp with min timestamp.
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [internal key]: <user_provided_key | original_ts | seqno + type>
|
|
|
|
// output before: empty
|
|
|
|
// output after: <user_provided_key | min_ts | seqno + type>
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping the timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard-coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When the user-defined timestamp is stripped during block building, on writing the first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance-critical path, I did the following benchmark to check that no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy; units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key,
|
|
|
|
size_t ts_sz);
|
2023-05-25 22:41:32 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Attempt to parse an internal key from "internal_key". On success,
|
|
|
|
// stores the parsed data in "*result", and returns true.
|
|
|
|
//
|
|
|
|
// On error, returns false, leaves "*result" in an undefined state.
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping the timestamp here means replacing the original user-defined timestamp with a minimum timestamp, which for now is hard-coded to be all zero bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit of tweaking to work for this feature. When the user-defined timestamp is stripped during block building, on writing the first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous tests didn't catch the bug because in those tests the size of the stripped user-defined timestamp bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance-critical path, I did the following benchmark to check that no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simultaneous for better accuracy; units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
Status ParseInternalKey(const Slice& internal_key, ParsedInternalKey* result,
|
|
|
|
bool log_err_key);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Returns the user key portion of an internal key.
|
2023-08-15 05:04:18 +00:00
|
|
|
//
|
|
|
|
// input [internal key]: <user_key | seqno + type>
|
|
|
|
// output: <user_key>
|
2011-03-18 22:37:00 +00:00
|
|
|
inline Slice ExtractUserKey(const Slice& internal_key) {
|
2020-10-01 17:08:52 +00:00
|
|
|
assert(internal_key.size() >= kNumInternalBytes);
|
|
|
|
return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2023-08-15 05:04:18 +00:00
|
|
|
// input [internal key]: <user_provided_key | ts | seqno + type>
|
|
|
|
// output : <user_provided_key>
|
2019-06-06 06:07:28 +00:00
|
|
|
inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
|
|
|
|
size_t ts_sz) {
|
2022-02-09 17:49:35 +00:00
|
|
|
Slice ret = internal_key;
|
|
|
|
ret.remove_suffix(kNumInternalBytes + ts_sz);
|
|
|
|
return ret;
|
2019-06-06 06:07:28 +00:00
|
|
|
}
|
|
|
|
|
2023-08-15 05:04:18 +00:00
|
|
|
// input [user key]: <user_provided_key | ts>
|
|
|
|
// output: <user_provided_key>
|
2019-06-06 06:07:28 +00:00
|
|
|
inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
|
2022-02-09 17:49:35 +00:00
|
|
|
Slice ret = user_key;
|
|
|
|
ret.remove_suffix(ts_sz);
|
|
|
|
return ret;
|
2019-06-06 06:07:28 +00:00
|
|
|
}
|
|
|
|
|
2023-08-15 05:04:18 +00:00
|
|
|
// input [user key]: <user_provided_key | ts>
|
|
|
|
// output: <ts>
|
2020-03-02 23:58:32 +00:00
|
|
|
inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
|
|
|
|
assert(user_key.size() >= ts_sz);
|
|
|
|
return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz);
|
|
|
|
}
|
|
|
|
|
2023-08-15 05:04:18 +00:00
|
|
|
// input [internal key]: <user_provided_key | ts | seqno + type>
|
|
|
|
// output: <ts>
|
2022-03-12 00:13:23 +00:00
|
|
|
inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) {
|
|
|
|
const size_t key_size = internal_key.size();
|
|
|
|
assert(key_size >= kNumInternalBytes + ts_sz);
|
|
|
|
return Slice(internal_key.data() + key_size - ts_sz - kNumInternalBytes,
|
|
|
|
ts_sz);
|
|
|
|
}
|
|
|
|
|
2023-08-15 05:04:18 +00:00
|
|
|
// input [internal key]: <user_provided_key | ts | seqno + type>
|
|
|
|
// output: <seqno + type>
|
2018-07-14 00:34:54 +00:00
|
|
|
inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
|
2020-10-01 17:08:52 +00:00
|
|
|
assert(internal_key.size() >= kNumInternalBytes);
|
2011-03-18 22:37:00 +00:00
|
|
|
const size_t n = internal_key.size();
|
2020-10-01 17:08:52 +00:00
|
|
|
return DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
|
2018-07-14 00:34:54 +00:00
|
|
|
}
|
|
|
|
|
2023-08-15 05:04:18 +00:00
|
|
|
// input [internal key]: <user_provided_key | ts | seqno + type>
|
|
|
|
// output: <type>
|
2018-07-14 00:34:54 +00:00
|
|
|
inline ValueType ExtractValueType(const Slice& internal_key) {
|
|
|
|
uint64_t num = ExtractInternalKeyFooter(internal_key);
|
2011-03-18 22:37:00 +00:00
|
|
|
unsigned char c = num & 0xff;
|
|
|
|
return static_cast<ValueType>(c);
|
|
|
|
}
|
|
|
|
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
// input [internal key]: <user_provided_key | ts | seqno + type>
|
|
|
|
// output: <seqno>
|
|
|
|
inline SequenceNumber ExtractSequenceNumber(const Slice& internal_key) {
|
|
|
|
uint64_t num = ExtractInternalKeyFooter(internal_key);
|
|
|
|
return num >> 8;
|
|
|
|
}
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// A comparator for internal keys that uses a specified comparator for
|
|
|
|
// the user key portion and breaks ties by decreasing sequence number.
|
2017-09-11 18:53:22 +00:00
|
|
|
class InternalKeyComparator
|
|
|
|
#ifdef NDEBUG
|
|
|
|
final
|
|
|
|
#endif
|
2022-07-14 17:09:31 +00:00
|
|
|
: public CompareInterface {
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
2019-03-27 17:24:16 +00:00
|
|
|
UserComparatorWrapper user_comparator_;
|
2019-03-27 23:13:08 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
public:
|
2020-07-08 00:25:08 +00:00
|
|
|
// `InternalKeyComparator`s constructed with the default constructor are not
|
|
|
|
// usable and will segfault on any attempt to use them for comparisons.
|
|
|
|
InternalKeyComparator() = default;
|
|
|
|
|
|
|
|
// @param named If true, assign a name to this comparator based on the
|
|
|
|
// underlying comparator's name. This involves an allocation and copy in
|
|
|
|
// this constructor to precompute the result of `Name()`. To avoid this
|
|
|
|
// overhead, set `named` to false. In that case, `Name()` will return a
|
|
|
|
// generic name that is non-specific to the underlying comparator.
|
2022-07-14 17:09:31 +00:00
|
|
|
explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {}
|
2014-01-27 21:53:22 +00:00
|
|
|
virtual ~InternalKeyComparator() {}
|
2013-06-10 20:28:58 +00:00
|
|
|
|
2022-07-14 17:09:31 +00:00
|
|
|
int Compare(const Slice& a, const Slice& b) const override;
|
|
|
|
|
|
|
|
bool Equal(const Slice& a, const Slice& b) const {
|
|
|
|
// TODO Use user_comparator_.Equal(). Perhaps compare seqno before
|
|
|
|
// comparing the user key too.
|
|
|
|
return Compare(a, b) == 0;
|
|
|
|
}
|
|
|
|
|
2018-02-01 02:45:49 +00:00
|
|
|
// Same as Compare except that it excludes the value type from comparison
|
2022-07-14 17:09:31 +00:00
|
|
|
int CompareKeySeq(const Slice& a, const Slice& b) const;
|
2023-11-21 01:07:28 +00:00
|
|
|
int CompareKeySeq(const ParsedInternalKey& a, const Slice& b) const;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2019-03-27 17:24:16 +00:00
|
|
|
const Comparator* user_comparator() const {
|
|
|
|
return user_comparator_.user_comparator();
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
int Compare(const InternalKey& a, const InternalKey& b) const;
|
2014-01-27 21:53:22 +00:00
|
|
|
int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
|
2022-11-29 03:27:22 +00:00
|
|
|
int Compare(const Slice& a, const ParsedInternalKey& b) const;
|
|
|
|
int Compare(const ParsedInternalKey& a, const Slice& b) const;
|
2020-07-08 00:25:08 +00:00
|
|
|
// In this `Compare()` overload, the sequence numbers provided in
|
|
|
|
// `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a`
|
|
|
|
// and `b`, respectively. To disable sequence number override(s), provide the
|
|
|
|
// value `kDisableGlobalSequenceNumber`.
|
|
|
|
int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b,
|
|
|
|
SequenceNumber b_global_seqno) const;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2019-05-23 23:16:38 +00:00
|
|
|
// The class represent the internal key in encoded form.
|
2011-03-18 22:37:00 +00:00
|
|
|
class InternalKey {
|
|
|
|
private:
|
|
|
|
std::string rep_;
|
2019-03-27 23:13:08 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
public:
|
2019-03-27 23:13:08 +00:00
|
|
|
InternalKey() {} // Leave rep_ as empty to indicate it is invalid
|
2014-11-06 19:14:28 +00:00
|
|
|
InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
|
|
|
|
AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
2022-09-30 23:13:03 +00:00
|
|
|
InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t, Slice ts) {
|
|
|
|
AppendInternalKeyWithDifferentTimestamp(
|
|
|
|
&rep_, ParsedInternalKey(_user_key, s, t), ts);
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2015-04-24 01:08:37 +00:00
|
|
|
// sets the internal key to be bigger or equal to all internal keys with this
|
|
|
|
// user key
|
|
|
|
void SetMaxPossibleForUserKey(const Slice& _user_key) {
|
2017-09-13 00:16:44 +00:00
|
|
|
AppendInternalKey(
|
|
|
|
&rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
|
2015-04-24 01:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// sets the internal key to be smaller or equal to all internal keys with this
|
|
|
|
// user key
|
|
|
|
void SetMinPossibleForUserKey(const Slice& _user_key) {
|
2017-09-13 00:16:44 +00:00
|
|
|
AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
|
|
|
|
kValueTypeForSeek));
|
2015-04-24 01:08:37 +00:00
|
|
|
}
|
|
|
|
|
[fix] SIGSEGV when VersionEdit in MANIFEST is corrupted
Summary:
This was reported by our customers in task #4295529.
Cause:
* MANIFEST file contains a VersionEdit, which contains file entries whose 'smallest' and 'largest' internal keys are empty. String with zero characters. Root cause of corruption was not investigated. We should report corruption when this happens. However, we currently SIGSEGV.
Here's what happens:
* VersionEdit encodes zero-strings happily and stores them in smallest and largest InternalKeys. InternalKey::Encode() does assert when `rep_.empty()`, but we don't assert in production environemnts. Also, we should never assert as a result of DB corruption.
* As part of our ConsistencyCheck, we call GetLiveFilesMetaData()
* GetLiveFilesMetadata() calls `file->largest.user_key().ToString()`
* user_key() function does: 1. assert(size > 8) (ooops, no assert), 2. returns `Slice(internal_key.data(), internal_key.size() - 8)`
* since `internal_key.size()` is unsigned int, this call translates to `Slice(whatever, 1298471928561892576182756)`. Bazinga.
Fix:
* VersionEdit checks if InternalKey is valid in `VersionEdit::GetInternalKey()`. If it's invalid, returns corruption.
Lessons learned:
* Always keep in mind that even if you `assert()`, production code will continue execution even if assert fails.
* Never `assert` based on DB corruption. Assert only if the code should guarantee that assert can't fail.
Test Plan: dumped offending manifest. Before: assert. Now: corruption
Reviewers: dhruba, haobo, sdong
Reviewed By: dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18507
2014-05-07 23:52:12 +00:00
|
|
|
bool Valid() const {
|
|
|
|
ParsedInternalKey parsed;
|
2020-10-28 17:11:13 +00:00
|
|
|
return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */)
|
|
|
|
.ok()); // TODO
|
[fix] SIGSEGV when VersionEdit in MANIFEST is corrupted
Summary:
This was reported by our customers in task #4295529.
Cause:
* MANIFEST file contains a VersionEdit, which contains file entries whose 'smallest' and 'largest' internal keys are empty. String with zero characters. Root cause of corruption was not investigated. We should report corruption when this happens. However, we currently SIGSEGV.
Here's what happens:
* VersionEdit encodes zero-strings happily and stores them in smallest and largest InternalKeys. InternalKey::Encode() does assert when `rep_.empty()`, but we don't assert in production environemnts. Also, we should never assert as a result of DB corruption.
* As part of our ConsistencyCheck, we call GetLiveFilesMetaData()
* GetLiveFilesMetadata() calls `file->largest.user_key().ToString()`
* user_key() function does: 1. assert(size > 8) (ooops, no assert), 2. returns `Slice(internal_key.data(), internal_key.size() - 8)`
* since `internal_key.size()` is unsigned int, this call translates to `Slice(whatever, 1298471928561892576182756)`. Bazinga.
Fix:
* VersionEdit checks if InternalKey is valid in `VersionEdit::GetInternalKey()`. If it's invalid, returns corruption.
Lessons learned:
* Always keep in mind that even if you `assert()`, production code will continue execution even if assert fails.
* Never `assert` based on DB corruption. Assert only if the code should guarantee that assert can't fail.
Test Plan: dumped offending manifest. Before: assert. Now: corruption
Reviewers: dhruba, haobo, sdong
Reviewed By: dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D18507
2014-05-07 23:52:12 +00:00
|
|
|
}
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
|
|
|
|
Slice Encode() const {
|
|
|
|
assert(!rep_.empty());
|
|
|
|
return rep_;
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice user_key() const { return ExtractUserKey(rep_); }
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
size_t size() const { return rep_.size(); }
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2016-06-13 16:57:43 +00:00
|
|
|
void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
|
|
|
|
SetFrom(ParsedInternalKey(_user_key, s, t));
|
|
|
|
}
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
void Set(const Slice& _user_key_with_ts, SequenceNumber s, ValueType t,
|
|
|
|
const Slice& ts) {
|
2024-01-02 19:17:39 +00:00
|
|
|
ParsedInternalKey pik(_user_key_with_ts, s, t);
|
2022-09-30 23:13:03 +00:00
|
|
|
// Should not call pik.SetTimestamp() directly as it overwrites the buffer
|
|
|
|
// containing _user_key.
|
|
|
|
SetFrom(pik, ts);
|
|
|
|
}
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
void SetFrom(const ParsedInternalKey& p) {
|
|
|
|
rep_.clear();
|
|
|
|
AppendInternalKey(&rep_, p);
|
|
|
|
}
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
void SetFrom(const ParsedInternalKey& p, const Slice& ts) {
|
|
|
|
rep_.clear();
|
|
|
|
AppendInternalKeyWithDifferentTimestamp(&rep_, p, ts);
|
|
|
|
}
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
void Clear() { rep_.clear(); }
|
2011-10-05 23:30:28 +00:00
|
|
|
|
2016-12-01 15:00:17 +00:00
|
|
|
// The underlying representation.
|
|
|
|
// Intended only to be used together with ConvertFromUserKey().
|
|
|
|
std::string* rep() { return &rep_; }
|
|
|
|
|
|
|
|
// Assuming that *rep() contains a user key, this method makes internal key
|
|
|
|
// out of it in-place. This saves a memcpy compared to Set()/SetFrom().
|
|
|
|
void ConvertFromUserKey(SequenceNumber s, ValueType t) {
|
|
|
|
AppendInternalKeyFooter(&rep_, s, t);
|
|
|
|
}
|
|
|
|
|
2020-10-28 17:11:13 +00:00
|
|
|
std::string DebugString(bool hex) const;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2019-03-27 23:13:08 +00:00
|
|
|
inline int InternalKeyComparator::Compare(const InternalKey& a,
|
|
|
|
const InternalKey& b) const {
|
2011-03-18 22:37:00 +00:00
|
|
|
return Compare(a.Encode(), b.Encode());
|
|
|
|
}
|
|
|
|
|
2020-10-01 02:15:42 +00:00
|
|
|
// Splits the encoded `internal_key` into user key, sequence number and value
// type and stores them in `*result`.
//
// Returns Status::Corruption if the key is shorter than kNumInternalBytes, or
// if the decoded type is rejected by IsExtendedValueType(); in the latter case
// `*result` has already been filled in and `log_err_key` is forwarded to
// ParsedInternalKey::DebugString() to build the error message. Returns
// Status::OK otherwise.
inline Status ParseInternalKey(const Slice& internal_key,
                               ParsedInternalKey* result, bool log_err_key) {
  const size_t key_size = internal_key.size();

  // Too short to even hold the footer.
  if (key_size < kNumInternalBytes) {
    return Status::Corruption("Corrupted Key: Internal Key too small. Size=" +
                              std::to_string(key_size) + ". ");
  }

  // The footer is the last kNumInternalBytes bytes: the low byte is the value
  // type, the remaining bits are the sequence number.
  const size_t user_key_size = key_size - kNumInternalBytes;
  const uint64_t footer = DecodeFixed64(internal_key.data() + user_key_size);
  const unsigned char type_byte = footer & 0xff;
  result->sequence = footer >> 8;
  result->type = static_cast<ValueType>(type_byte);
  assert(result->type <= ValueType::kMaxValue);
  result->user_key = Slice(internal_key.data(), user_key_size);

  if (!IsExtendedValueType(result->type)) {
    return Status::Corruption("Corrupted Key",
                              result->DebugString(log_err_key, true));
  }
  return Status::OK();
}
|
|
|
|
|
2015-07-14 07:21:41 +00:00
|
|
|
// Update the sequence number in the internal key.
|
|
|
|
// Guarantees not to invalidate ikey.data().
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
|
2015-07-14 07:21:41 +00:00
|
|
|
size_t ikey_sz = ikey->size();
|
2020-10-01 17:08:52 +00:00
|
|
|
assert(ikey_sz >= kNumInternalBytes);
|
2013-02-15 22:31:24 +00:00
|
|
|
uint64_t newval = (seq << 8) | t;
|
2015-07-14 07:21:41 +00:00
|
|
|
|
|
|
|
// Note: Since C++11, strings are guaranteed to be stored contiguously and
|
|
|
|
// string::operator[]() is guaranteed not to change ikey.data().
|
2020-10-01 17:08:52 +00:00
|
|
|
EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval);
|
2013-02-15 22:31:24 +00:00
|
|
|
}
|
|
|
|
|
2013-06-14 05:09:08 +00:00
|
|
|
// Get the sequence number from the internal key
|
|
|
|
inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
|
|
|
|
const size_t n = internal_key.size();
|
2020-10-01 17:08:52 +00:00
|
|
|
assert(n >= kNumInternalBytes);
|
|
|
|
uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
|
2013-06-14 05:09:08 +00:00
|
|
|
return num >> 8;
|
|
|
|
}
|
|
|
|
|
2019-05-23 23:16:38 +00:00
|
|
|
// The class to store keys in an efficient way. It allows:
|
|
|
|
// 1. Users can either copy the key into it, or have it point to an unowned
|
|
|
|
// address.
|
|
|
|
// 2. For copied key, a short inline buffer is kept to reduce memory
|
|
|
|
// allocation for smaller keys.
|
|
|
|
// 3. It tracks whether it holds a user key or an internal key, and allows
//    conversion between them.
|
2014-04-07 23:56:26 +00:00
|
|
|
class IterKey {
|
|
|
|
public:
|
2015-12-16 20:08:30 +00:00
|
|
|
IterKey()
|
2017-04-04 21:17:16 +00:00
|
|
|
: buf_(space_),
|
|
|
|
key_(buf_),
|
|
|
|
key_size_(0),
|
2019-03-21 16:51:29 +00:00
|
|
|
buf_size_(sizeof(space_)),
|
2017-04-04 21:17:16 +00:00
|
|
|
is_user_key_(true) {}
|
2019-09-12 01:07:12 +00:00
|
|
|
// No copying allowed
|
|
|
|
// Copy construction is deleted. buf_ may point at a separately allocated
// array that the destructor releases through ResetBuffer(); presumably
// copying is forbidden so two objects never share that array.
IterKey(const IterKey&) = delete;
|
|
|
|
// Copy assignment is deleted as well.
void operator=(const IterKey&) = delete;
|
2014-04-07 23:56:26 +00:00
|
|
|
|
2014-04-09 00:30:45 +00:00
|
|
|
// Releases the buffer through ResetBuffer(), which frees buf_ when it is not
// the inline array.
~IterKey() {
  ResetBuffer();
}
|
2014-04-07 23:56:26 +00:00
|
|
|
|
2018-07-17 00:10:44 +00:00
|
|
|
// The bool will be picked up by the next calls to SetKey
|
|
|
|
void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
|
|
|
|
|
|
|
|
// Returns the key in whichever format that was provided to KeyIter
|
2022-11-01 05:28:58 +00:00
|
|
|
// If user-defined timestamp is enabled, then timestamp is included in the
|
|
|
|
// return result.
|
2018-07-17 00:10:44 +00:00
|
|
|
Slice GetKey() const { return Slice(key_, key_size_); }
|
|
|
|
|
2017-04-04 21:17:16 +00:00
|
|
|
// Returns the whole stored key.
// Precondition (asserted): the key is flagged as an internal key.
Slice GetInternalKey() const {
  assert(!IsUserKey());
  const Slice whole_key(key_, key_size_);
  return whole_key;
}
|
2014-04-07 23:56:26 +00:00
|
|
|
|
2022-11-01 05:28:58 +00:00
|
|
|
// If user-defined timestamp is enabled, then timestamp is included in the
|
|
|
|
// return result of GetUserKey();
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
Slice GetUserKey() const {
|
2017-04-04 21:17:16 +00:00
|
|
|
if (IsUserKey()) {
|
|
|
|
return Slice(key_, key_size_);
|
|
|
|
} else {
|
2020-10-01 17:08:52 +00:00
|
|
|
assert(key_size_ >= kNumInternalBytes);
|
|
|
|
return Slice(key_, key_size_ - kNumInternalBytes);
|
2017-04-04 21:17:16 +00:00
|
|
|
}
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
}
|
|
|
|
|
2015-09-18 18:10:00 +00:00
|
|
|
// Length in bytes of the stored key.
size_t Size() const {
  return key_size_;
}
|
2014-07-23 19:31:11 +00:00
|
|
|
|
2014-04-09 00:30:45 +00:00
|
|
|
// Marks the key as empty by zeroing its length. The buffer and the key
// pointer are left as they are.
void Clear() {
  key_size_ = 0;
}
|
2014-04-07 23:56:26 +00:00
|
|
|
|
2014-07-23 19:31:11 +00:00
|
|
|
// Append "non_shared_data" to its back, from "shared_len"
|
|
|
|
// This function is used in Block::Iter::ParseNextKey
|
|
|
|
// shared_len: bytes in [0, shard_len-1] would be remained
|
|
|
|
// non_shared_data: data to be append, its length must be >= non_shared_len
|
|
|
|
void TrimAppend(const size_t shared_len, const char* non_shared_data,
|
|
|
|
const size_t non_shared_len) {
|
|
|
|
assert(shared_len <= key_size_);
|
|
|
|
size_t total_size = shared_len + non_shared_len;
|
2015-12-16 20:08:30 +00:00
|
|
|
|
|
|
|
if (IsKeyPinned() /* key is not in buf_ */) {
|
|
|
|
// Copy the key from external memory to buf_ (copy shared_len bytes)
|
|
|
|
EnlargeBufferIfNeeded(total_size);
|
|
|
|
memcpy(buf_, key_, shared_len);
|
|
|
|
} else if (total_size > buf_size_) {
|
2014-07-23 19:31:11 +00:00
|
|
|
// Need to allocate space, delete previous space
|
|
|
|
char* p = new char[total_size];
|
|
|
|
memcpy(p, key_, shared_len);
|
|
|
|
|
2015-12-16 20:08:30 +00:00
|
|
|
if (buf_ != space_) {
|
|
|
|
delete[] buf_;
|
2014-07-23 19:31:11 +00:00
|
|
|
}
|
|
|
|
|
2015-12-16 20:08:30 +00:00
|
|
|
buf_ = p;
|
2014-07-23 19:31:11 +00:00
|
|
|
buf_size_ = total_size;
|
|
|
|
}
|
|
|
|
|
2015-12-16 20:08:30 +00:00
|
|
|
memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
|
|
|
|
key_ = buf_;
|
|
|
|
key_size_ = total_size;
|
2014-07-23 19:31:11 +00:00
|
|
|
}
|
|
|
|
|
2023-05-25 22:41:32 +00:00
|
|
|
// A version of `TrimAppend` assuming the last bytes of length `ts_sz` in the
|
|
|
|
// user key part of `key_` is not counted towards shared bytes. And the
|
|
|
|
// decoded key needed a min timestamp of length `ts_sz` pad to the user key.
|
|
|
|
void TrimAppendWithTimestamp(const size_t shared_len,
|
|
|
|
const char* non_shared_data,
|
|
|
|
const size_t non_shared_len,
|
|
|
|
const size_t ts_sz) {
|
|
|
|
std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
|
|
|
|
std::string key_with_ts;
|
|
|
|
std::vector<Slice> key_parts_with_ts;
|
|
|
|
if (IsUserKey()) {
|
|
|
|
key_parts_with_ts = {Slice(key_, shared_len),
|
|
|
|
Slice(non_shared_data, non_shared_len),
|
|
|
|
Slice(kTsMin)};
|
|
|
|
} else {
|
|
|
|
assert(shared_len + non_shared_len >= kNumInternalBytes);
|
|
|
|
// Invaraint: shared_user_key_len + shared_internal_bytes_len = shared_len
|
|
|
|
// In naming below `*_len` variables, keyword `user_key` refers to the
|
|
|
|
// user key part of the existing key in `key_` as apposed to the new key.
|
|
|
|
// Similary, `internal_bytes` refers to the footer part of the existing
|
|
|
|
// key. These bytes potentially will move between user key part and the
|
|
|
|
// footer part in the new key.
|
|
|
|
const size_t user_key_len = key_size_ - kNumInternalBytes;
|
|
|
|
const size_t sharable_user_key_len = user_key_len - ts_sz;
|
|
|
|
const size_t shared_user_key_len =
|
|
|
|
std::min(shared_len, sharable_user_key_len);
|
|
|
|
const size_t shared_internal_bytes_len = shared_len - shared_user_key_len;
|
|
|
|
|
|
|
|
// One Slice among the three Slices will get split into two Slices, plus
|
|
|
|
// a timestamp slice.
|
|
|
|
key_parts_with_ts.reserve(5);
|
|
|
|
bool ts_added = false;
|
|
|
|
// Add slice parts and find the right location to add the min timestamp.
|
|
|
|
MaybeAddKeyPartsWithTimestamp(
|
|
|
|
key_, shared_user_key_len,
|
|
|
|
shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
|
|
|
|
shared_len + non_shared_len - kNumInternalBytes, kTsMin,
|
|
|
|
key_parts_with_ts, &ts_added);
|
|
|
|
MaybeAddKeyPartsWithTimestamp(
|
|
|
|
key_ + user_key_len, shared_internal_bytes_len,
|
|
|
|
non_shared_len < kNumInternalBytes,
|
|
|
|
shared_internal_bytes_len + non_shared_len - kNumInternalBytes,
|
|
|
|
kTsMin, key_parts_with_ts, &ts_added);
|
|
|
|
MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
|
|
|
|
non_shared_len >= kNumInternalBytes,
|
|
|
|
non_shared_len - kNumInternalBytes, kTsMin,
|
|
|
|
key_parts_with_ts, &ts_added);
|
|
|
|
assert(ts_added);
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice new_key(SliceParts(&key_parts_with_ts.front(),
|
|
|
|
static_cast<int>(key_parts_with_ts.size())),
|
|
|
|
&key_with_ts);
|
|
|
|
SetKey(new_key);
|
|
|
|
}
|
|
|
|
|
2018-07-17 00:10:44 +00:00
|
|
|
// Stores `key` and returns a Slice over the stored bytes.
// With copy == true the bytes are copied into the owned buffer; otherwise
// the IterKey only points at `key`'s memory.
// The user-key/internal-key flag is not touched here; it must have been set
// beforehand, e.g. through SetIsUserKey().
Slice SetKey(const Slice& key, bool copy = true) {
  const Slice stored = SetKeyImpl(key, copy);
  return stored;
}
|
|
|
|
|
2022-11-01 05:28:58 +00:00
|
|
|
// If user-defined timestamp is enabled, then `key` includes timestamp.
|
|
|
|
// TODO(yanqin) this is also used to set prefix, which do not include
|
|
|
|
// timestamp. Should be handled.
|
2017-04-04 21:17:16 +00:00
|
|
|
Slice SetUserKey(const Slice& key, bool copy = true) {
|
|
|
|
is_user_key_ = true;
|
|
|
|
return SetKeyImpl(key, copy);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stores `key` as an internal key and returns a Slice over the stored bytes.
// With copy == true the bytes are copied into the owned buffer; otherwise
// the IterKey only points at `key`'s memory.
Slice SetInternalKey(const Slice& key, bool copy = true) {
  is_user_key_ = false;
  const Slice stored = SetKeyImpl(key, copy);
  return stored;
}
|
|
|
|
|
|
|
|
// Copies the content of key, updates the reference to the user key in ikey
|
|
|
|
// and returns a Slice referencing the new copy.
|
2017-04-04 21:17:16 +00:00
|
|
|
Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
size_t key_n = key.size();
|
2020-10-01 17:08:52 +00:00
|
|
|
assert(key_n >= kNumInternalBytes);
|
2017-04-04 21:17:16 +00:00
|
|
|
SetInternalKey(key);
|
2020-10-01 17:08:52 +00:00
|
|
|
ikey->user_key = Slice(key_, key_n - kNumInternalBytes);
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
return Slice(key_, key_n);
|
|
|
|
}
|
|
|
|
|
2016-10-18 23:59:37 +00:00
|
|
|
// Copy the key into IterKey own buf_
|
|
|
|
void OwnKey() {
|
|
|
|
assert(IsKeyPinned() == true);
|
|
|
|
|
|
|
|
Reserve(key_size_);
|
|
|
|
memcpy(buf_, key_, key_size_);
|
|
|
|
key_ = buf_;
|
|
|
|
}
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
// Update the sequence number in the internal key. Guarantees not to
|
|
|
|
// invalidate slices to the key (and the user key).
|
Allow compaction iterator to perform garbage collection (#7556)
Summary:
Add a threshold timestamp, full_history_ts_low_ of type `std::string*` to
`CompactionIterator`, so that RocksDB can also perform garbage collection during
compaction.
* If full_history_ts_low_ is nullptr, then compaction iterator does not perform
GC, preserving all timestamp history for all keys. Compaction iterator will
treat user key with different timestamps as different user keys.
* If full_history_ts_low_ is not nullptr, then compaction iterator performs
GC. GC will look at keys older than `*full_history_ts_low_` and determine their
eligibility based on factors including snapshots.
Current rules of GC:
* If an internal key is in the same snapshot as a previous counterpart
with the same user key, and this key is eligible for GC, and the key is
not single-delete or merge operand, then this key can be dropped. Note
that the previous internal key cannot be a merge operand either.
* If a tombstone is the most recent one in the earliest snapshot and it
is eligible for GC, and keyNotExistsBeyondLevel() is true, then this
tombstone can be dropped.
* If a tombstone is the most recent one in a snapshot and it is eligible
for GC, and the compaction is at bottommost level, then all other older
internal keys of the same user key must also be eligible for GC, thus
can be dropped
* Single-delete, delete-range and merge are not currently supported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7556
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D24507728
Pulled By: riversand963
fbshipit-source-id: 3c09c7301f41eed76dfcf4d1527e68cf6e0a8bb3
2020-10-24 05:58:05 +00:00
|
|
|
void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
|
2015-12-16 20:08:30 +00:00
|
|
|
assert(!IsKeyPinned());
|
2020-10-01 17:08:52 +00:00
|
|
|
assert(key_size_ >= kNumInternalBytes);
|
Allow compaction iterator to perform garbage collection (#7556)
Summary:
Add a threshold timestamp, full_history_ts_low_ of type `std::string*` to
`CompactionIterator`, so that RocksDB can also perform garbage collection during
compaction.
* If full_history_ts_low_ is nullptr, then compaction iterator does not perform
GC, preserving all timestamp history for all keys. Compaction iterator will
treat user key with different timestamps as different user keys.
* If full_history_ts_low_ is not nullptr, then compaction iterator performs
GC. GC will look at keys older than `*full_history_ts_low_` and determine their
eligibility based on factors including snapshots.
Current rules of GC:
* If an internal key is in the same snapshot as a previous counterpart
with the same user key, and this key is eligible for GC, and the key is
not single-delete or merge operand, then this key can be dropped. Note
that the previous internal key cannot be a merge operand either.
* If a tombstone is the most recent one in the earliest snapshot and it
is eligible for GC, and keyNotExistsBeyondLevel() is true, then this
tombstone can be dropped.
* If a tombstone is the most recent one in a snapshot and it is eligible
for GC, and the compaction is at bottommost level, then all other older
internal keys of the same user key must also be eligible for GC, thus
can be dropped
* Single-delete, delete-range and merge are not currently supported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7556
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D24507728
Pulled By: riversand963
fbshipit-source-id: 3c09c7301f41eed76dfcf4d1527e68cf6e0a8bb3
2020-10-24 05:58:05 +00:00
|
|
|
if (ts) {
|
|
|
|
assert(key_size_ >= kNumInternalBytes + ts->size());
|
|
|
|
memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(),
|
|
|
|
ts->size());
|
|
|
|
}
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
uint64_t newval = (seq << 8) | t;
|
2020-10-01 17:08:52 +00:00
|
|
|
EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
|
2014-04-07 23:56:26 +00:00
|
|
|
}
|
|
|
|
|
2015-12-16 20:08:30 +00:00
|
|
|
// True when the key pointer refers to memory other than buf_, i.e. the key
// was stored without copying.
bool IsKeyPinned() const {
  return key_ != buf_;
}
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
// If `ts` is provided, user_key should not contain timestamp,
|
|
|
|
// and `ts` is appended after user_key.
|
|
|
|
// TODO: more efficient storage for timestamp.
|
2014-06-18 23:36:48 +00:00
|
|
|
void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
|
|
|
|
SequenceNumber s,
|
2020-03-07 00:21:03 +00:00
|
|
|
ValueType value_type = kValueTypeForSeek,
|
|
|
|
const Slice* ts = nullptr) {
|
2014-06-18 23:36:48 +00:00
|
|
|
size_t psize = key_prefix.size();
|
2014-04-07 23:56:26 +00:00
|
|
|
size_t usize = user_key.size();
|
2020-03-07 00:21:03 +00:00
|
|
|
size_t ts_sz = (ts != nullptr ? ts->size() : 0);
|
|
|
|
EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz);
|
2014-06-18 23:36:48 +00:00
|
|
|
if (psize > 0) {
|
2015-12-16 20:08:30 +00:00
|
|
|
memcpy(buf_, key_prefix.data(), psize);
|
2014-06-18 23:36:48 +00:00
|
|
|
}
|
2015-12-16 20:08:30 +00:00
|
|
|
memcpy(buf_ + psize, user_key.data(), usize);
|
2020-03-07 00:21:03 +00:00
|
|
|
if (ts) {
|
|
|
|
memcpy(buf_ + psize + usize, ts->data(), ts_sz);
|
|
|
|
}
|
|
|
|
EncodeFixed64(buf_ + usize + psize + ts_sz,
|
|
|
|
PackSequenceAndType(s, value_type));
|
2015-12-16 20:08:30 +00:00
|
|
|
|
|
|
|
key_ = buf_;
|
2020-03-07 00:21:03 +00:00
|
|
|
key_size_ = psize + usize + sizeof(uint64_t) + ts_sz;
|
2017-04-04 21:17:16 +00:00
|
|
|
is_user_key_ = false;
|
2014-06-18 23:36:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void SetInternalKey(const Slice& user_key, SequenceNumber s,
|
2020-03-07 00:21:03 +00:00
|
|
|
ValueType value_type = kValueTypeForSeek,
|
|
|
|
const Slice* ts = nullptr) {
|
|
|
|
SetInternalKey(Slice(), user_key, s, value_type, ts);
|
2014-06-18 23:36:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Makes sure buf_ can hold `size` bytes and sets the key length to `size`.
// The bytes themselves are not written here; the caller is expected to fill
// them in.
void Reserve(size_t size) {
  EnlargeBufferIfNeeded(size);
  key_size_ = size;
}
|
|
|
|
|
|
|
|
void SetInternalKey(const ParsedInternalKey& parsed_key) {
|
2014-06-18 23:36:48 +00:00
|
|
|
SetInternalKey(Slice(), parsed_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetInternalKey(const Slice& key_prefix,
|
|
|
|
const ParsedInternalKey& parsed_key_suffix) {
|
|
|
|
SetInternalKey(key_prefix, parsed_key_suffix.user_key,
|
|
|
|
parsed_key_suffix.sequence, parsed_key_suffix.type);
|
2014-04-07 23:56:26 +00:00
|
|
|
}
|
|
|
|
|
2014-07-01 18:05:05 +00:00
|
|
|
void EncodeLengthPrefixedKey(const Slice& key) {
|
|
|
|
auto size = key.size();
|
2014-11-11 21:47:22 +00:00
|
|
|
EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
|
2015-12-16 20:08:30 +00:00
|
|
|
char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
|
2014-07-01 18:05:05 +00:00
|
|
|
memcpy(ptr, key.data(), size);
|
2015-12-16 20:08:30 +00:00
|
|
|
key_ = buf_;
|
2017-04-04 21:17:16 +00:00
|
|
|
is_user_key_ = true;
|
2014-07-01 18:05:05 +00:00
|
|
|
}
|
|
|
|
|
2017-04-04 21:17:16 +00:00
|
|
|
// True when the stored key is flagged as a user key, false when it is
// flagged as an internal key.
bool IsUserKey() const {
  return is_user_key_;
}
|
|
|
|
|
2014-04-07 23:56:26 +00:00
|
|
|
private:
|
2015-12-16 20:08:30 +00:00
|
|
|
// Writable storage for copied keys. Starts out pointing at space_ and is
// replaced by a `new char[]` array when more room is needed; ResetBuffer()
// deletes it whenever it is not space_.
char* buf_;
|
|
|
|
// Start of the current key. Equals buf_ when the key was copied, and points
// at caller-provided memory when the key was stored without copying.
const char* key_;
|
2014-04-07 23:56:26 +00:00
|
|
|
// Length in bytes of the current key.
size_t key_size_;
|
2019-03-21 16:51:29 +00:00
|
|
|
// Capacity in bytes of the array buf_ points at.
size_t buf_size_;
|
2023-06-06 18:19:15 +00:00
|
|
|
char space_[39]; // Avoid allocation for short keys
|
2017-04-04 21:17:16 +00:00
|
|
|
// True when the current key is a user key, false when it is an internal key.
bool is_user_key_;
|
|
|
|
|
|
|
|
// Shared implementation of the SetKey family.
// copy == true:  the bytes of `key` are copied into buf_ (enlarged first if
//                needed) and key_ refers to that copy.
// copy == false: key_ simply points at `key`'s memory, which the caller must
//                keep alive.
// Returns a Slice over the stored key. The user-key flag is not modified.
Slice SetKeyImpl(const Slice& key, bool copy) {
  const size_t len = key.size();
  if (!copy) {
    // Pin: refer to the external bytes directly.
    key_ = key.data();
  } else {
    EnlargeBufferIfNeeded(len);
    memcpy(buf_, key.data(), len);
    key_ = buf_;
  }
  key_size_ = len;
  return Slice(key_, key_size_);
}
|
2014-04-07 23:56:26 +00:00
|
|
|
|
2014-04-09 00:30:45 +00:00
|
|
|
void ResetBuffer() {
|
2015-12-16 20:08:30 +00:00
|
|
|
if (buf_ != space_) {
|
|
|
|
delete[] buf_;
|
|
|
|
buf_ = space_;
|
2014-04-09 00:30:45 +00:00
|
|
|
}
|
2014-04-24 00:51:16 +00:00
|
|
|
buf_size_ = sizeof(space_);
|
2014-04-09 00:30:45 +00:00
|
|
|
key_size_ = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Enlarge the buffer size if needed based on key_size.
|
|
|
|
// By default, static allocated buffer is used. Once there is a key
|
|
|
|
// larger than the static allocated buffer, another buffer is dynamically
|
|
|
|
// allocated, until a larger key buffer is requested. In that case, we
|
|
|
|
// reallocate buffer and delete the old one.
|
|
|
|
void EnlargeBufferIfNeeded(size_t key_size) {
|
|
|
|
// If size is smaller than buffer size, continue using current buffer,
|
|
|
|
// or the static allocated one, as default
|
|
|
|
if (key_size > buf_size_) {
|
2017-09-14 22:41:19 +00:00
|
|
|
EnlargeBuffer(key_size);
|
2014-04-09 00:30:45 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-14 22:41:19 +00:00
|
|
|
void EnlargeBuffer(size_t key_size);
|
2023-05-25 22:41:32 +00:00
|
|
|
|
|
|
|
void MaybeAddKeyPartsWithTimestamp(const char* slice_data,
|
|
|
|
const size_t slice_sz, bool add_timestamp,
|
|
|
|
const size_t left_sz,
|
|
|
|
const std::string& min_timestamp,
|
|
|
|
std::vector<Slice>& key_parts,
|
|
|
|
bool* ts_added) {
|
|
|
|
if (add_timestamp && !*ts_added) {
|
|
|
|
assert(slice_sz >= left_sz);
|
|
|
|
key_parts.emplace_back(slice_data, left_sz);
|
|
|
|
key_parts.emplace_back(min_timestamp);
|
|
|
|
key_parts.emplace_back(slice_data + left_sz, slice_sz - left_sz);
|
|
|
|
*ts_added = true;
|
|
|
|
} else {
|
|
|
|
key_parts.emplace_back(slice_data, slice_sz);
|
|
|
|
}
|
|
|
|
}
|
2014-04-07 23:56:26 +00:00
|
|
|
};
|
|
|
|
|
2021-03-26 04:17:17 +00:00
|
|
|
// Convert from a SliceTransform of user keys, to a SliceTransform of
|
2022-06-10 15:51:45 +00:00
|
|
|
// internal keys.
|
2014-04-10 21:19:43 +00:00
|
|
|
class InternalKeySliceTransform : public SliceTransform {
|
|
|
|
public:
|
|
|
|
explicit InternalKeySliceTransform(const SliceTransform* transform)
|
|
|
|
: transform_(transform) {}
|
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
const char* Name() const override { return transform_->Name(); }
|
2014-04-10 21:19:43 +00:00
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
Slice Transform(const Slice& src) const override {
|
2014-04-10 21:19:43 +00:00
|
|
|
auto user_key = ExtractUserKey(src);
|
|
|
|
return transform_->Transform(user_key);
|
|
|
|
}
|
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
bool InDomain(const Slice& src) const override {
|
2014-04-10 21:19:43 +00:00
|
|
|
auto user_key = ExtractUserKey(src);
|
|
|
|
return transform_->InDomain(user_key);
|
|
|
|
}
|
|
|
|
|
2024-01-31 21:14:42 +00:00
|
|
|
bool InRange(const Slice& dst) const override {
|
2014-04-10 21:19:43 +00:00
|
|
|
auto user_key = ExtractUserKey(dst);
|
|
|
|
return transform_->InRange(user_key);
|
|
|
|
}
|
|
|
|
|
|
|
|
const SliceTransform* user_prefix_extractor() const { return transform_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
// Like comparator, InternalKeySliceTransform will not take care of the
|
|
|
|
// deletion of transform_
|
|
|
|
const SliceTransform* const transform_;
|
|
|
|
};
|
|
|
|
|
2016-04-01 22:23:46 +00:00
|
|
|
// Read the key of a record from a write batch.
|
|
|
|
// If this record represents the default column family then cf_record
|
|
|
|
// must be passed as false, otherwise it must be passed as true.
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a mininum timestamp, which for now is hard coded to be all zeros bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous test doesn't caught the bug because in those tests, the size of the stripped user-defined timestamps bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simulaneous for better accuracy, units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record);
|
2016-04-01 22:23:46 +00:00
|
|
|
|
2014-08-18 22:19:17 +00:00
|
|
|
// Read record from a write batch piece from input.
|
|
|
|
// tag, column_family, key, value and blob are return values. Callers own the
|
2021-12-10 19:03:39 +00:00
|
|
|
// slice they point to.
|
2014-08-18 22:19:17 +00:00
|
|
|
// Tag is defined as ValueType.
|
|
|
|
// input will be advanced to after the record.
|
2022-11-01 05:28:58 +00:00
|
|
|
// If user-defined timestamp is enabled for a column family, then the `key`
|
|
|
|
// resulting from this call will include timestamp.
|
Logically strip timestamp during flush (#11557)
Summary:
Logically strip the user-defined timestamp when L0 files are created during flush when `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` is false. Logically stripping timestamp here means replacing the original user-defined timestamp with a mininum timestamp, which for now is hard coded to be all zeros bytes.
While working on this, I caught a missing piece on the `BlockBuilder` level for this feature. The current quick path `std::min(buffer_size, last_key_size)` needs a bit tweaking to work for this feature. When user-defined timestamp is stripped during block building, on writing first entry or right after resetting, `buffer` is empty and `buffer_size` is zero as usual. However, in follow-up writes, depending on the size of the stripped user-defined timestamp, and the size of the value, what's in `buffer` can sometimes be smaller than `last_key_size`, leading `std::min(buffer_size, last_key_size)` to truncate the `last_key`. Previous test doesn't caught the bug because in those tests, the size of the stripped user-defined timestamps bytes is smaller than the length of the value. In order to avoid the conditional operation, this PR changed the original trivial `std::min` operation into an arithmetic operation. Since this is a change in a hot and performance critical path, I did the following benchmark to check no observable regression is introduced.
```TEST_TMPDIR=/dev/shm/rocksdb1 ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=50000000```
Compiled with DEBUG_LEVEL=0
Test vs. control runs simulaneous for better accuracy, units = ops/sec
PR vs base:
Round 1: 350652 vs 349055
Round 2: 365733 vs 364308
Round 3: 355681 vs 354475
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11557
Test Plan:
New timestamp specific test added or existing tests augmented, both are parameterized with `UserDefinedTimestampTestMode`:
`UserDefinedTimestampTestMode::kNormal` -> UDT feature enabled, write / read with min timestamp
`UserDefinedTimestampTestMode::kStripUserDefinedTimestamps` -> UDT feature enabled, write / read with min timestamp, set Options.persist_user_defined_timestamps to false.
```
make all check
./db_wal_test --gtest_filter="*WithTimestamp*"
./flush_job_test --gtest_filter="*WithTimestamp*"
./repair_test --gtest_filter="*WithTimestamp*"
./block_based_table_reader_test
```
Reviewed By: pdillinger
Differential Revision: D47027664
Pulled By: jowlyzhang
fbshipit-source-id: e729193b6334dfc63aaa736d684d907a022571f5
2023-06-29 22:50:50 +00:00
|
|
|
Status ReadRecordFromWriteBatch(Slice* input, char* tag,
|
|
|
|
uint32_t* column_family, Slice* key,
|
Add initial support for TimedPut API (#12419)
Summary:
This PR adds support for `TimedPut` API. We introduced a new type `kTypeValuePreferredSeqno` for entries added to the DB via the `TimedPut` API.
The life cycle of such an entry on the write/flush/compaction paths are:
1) It is initially added to memtable as:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, write_unix_time}`
2) When it's flushed to L0 sst files, it's converted to:
`<user_key, seq, kTypeValuePreferredSeqno>: {value, preferred_seqno}`
when we have easy access to the seqno to time mapping.
3) During compaction, if certain conditions are met, we swap in the `preferred_seqno` and the entry will become:
`<user_key, preferred_seqno, kTypeValue>: value`. This step helps fast track these entries to the cold tier if they are eligible after the sequence number swap.
On the read path:
A `kTypeValuePreferredSeqno` entry acts the same as a `kTypeValue` entry, the unix_write_time/preferred seqno part packed in value is completely ignored.
Needed follow ups:
1) The seqno to time mapping accessible in flush needs to be extended to cover the `write_unix_time` for possible `kTypeValuePreferredSeqno` entries. This also means we need to track these `write_unix_time` in memtable.
2) Compaction filter support for the new `kTypeValuePreferredSeqno` type for feature parity with other `kTypeValue` and equivalent types.
3) Stress test coverage for the feature
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12419
Test Plan: Added unit tests
Reviewed By: pdillinger
Differential Revision: D54920296
Pulled By: jowlyzhang
fbshipit-source-id: c8b43f7a7c465e569141770e93c748371ff1da9e
2024-03-14 22:44:55 +00:00
|
|
|
Slice* value, Slice* blob, Slice* xid,
|
|
|
|
uint64_t* write_unix_time);
|
2016-08-19 22:10:31 +00:00
|
|
|
|
|
|
|
// When user call DeleteRange() to delete a range of keys,
|
|
|
|
// we will store a serialized RangeTombstone in MemTable and SST.
|
2022-09-30 23:13:03 +00:00
|
|
|
// the struct here is an easy-understood form
|
2016-08-19 22:10:31 +00:00
|
|
|
// start/end_key_ is the start/end user key of the range to be deleted
|
|
|
|
struct RangeTombstone {
|
|
|
|
Slice start_key_;
|
|
|
|
Slice end_key_;
|
|
|
|
SequenceNumber seq_;
|
2022-09-30 23:13:03 +00:00
|
|
|
// TODO: we should optimize the storage here when user-defined timestamp
|
|
|
|
// is NOT enabled: they currently take up (16 + 32 + 32) bytes per tombstone.
|
|
|
|
Slice ts_;
|
|
|
|
std::string pinned_start_key_;
|
|
|
|
std::string pinned_end_key_;
|
|
|
|
|
2016-12-20 00:44:30 +00:00
|
|
|
RangeTombstone() = default;
|
|
|
|
RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
|
2016-08-19 22:10:31 +00:00
|
|
|
: start_key_(sk), end_key_(ek), seq_(sn) {}
|
|
|
|
|
2022-09-30 23:13:03 +00:00
|
|
|
// User-defined timestamp is enabled, `sk` and `ek` should be user key
|
|
|
|
// with timestamp, `ts` will replace the timestamps in `sk` and
|
|
|
|
// `ek`.
|
2024-01-29 19:37:34 +00:00
|
|
|
// When `logical_strip_timestamp` is true, the timestamps in `sk` and `ek`
|
|
|
|
// will be replaced with min timestamp.
|
|
|
|
RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts,
|
|
|
|
bool logical_strip_timestamp)
|
|
|
|
: seq_(sn) {
|
|
|
|
const size_t ts_sz = ts.size();
|
|
|
|
assert(ts_sz > 0);
|
2022-09-30 23:13:03 +00:00
|
|
|
pinned_start_key_.reserve(sk.size());
|
|
|
|
pinned_end_key_.reserve(ek.size());
|
2024-01-29 19:37:34 +00:00
|
|
|
if (logical_strip_timestamp) {
|
|
|
|
AppendUserKeyWithMinTimestamp(&pinned_start_key_, sk, ts_sz);
|
|
|
|
AppendUserKeyWithMinTimestamp(&pinned_end_key_, ek, ts_sz);
|
|
|
|
} else {
|
|
|
|
AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts);
|
|
|
|
AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts);
|
|
|
|
}
|
2022-09-30 23:13:03 +00:00
|
|
|
start_key_ = pinned_start_key_;
|
|
|
|
end_key_ = pinned_end_key_;
|
2024-01-29 19:37:34 +00:00
|
|
|
ts_ = Slice(pinned_start_key_.data() + sk.size() - ts_sz, ts_sz);
|
2022-09-30 23:13:03 +00:00
|
|
|
}
|
|
|
|
|
2016-12-20 00:44:30 +00:00
|
|
|
RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
|
Compaction Support for Range Deletion
Summary:
This diff introduces RangeDelAggregator, which takes ownership of iterators
provided to it via AddTombstones(). The tombstones are organized in a two-level
map (snapshot stripe -> begin key -> tombstone). Tombstone creation avoids data
copy by holding Slices returned by the iterator, which remain valid thanks to pinning.
For compaction, we create a hierarchical range tombstone iterator with structure
matching the iterator over compaction input data. An aggregator based on that
iterator is used by CompactionIterator to determine which keys are covered by
range tombstones. In case of merge operand, the same aggregator is used by
MergeHelper. Upon finishing each file in the compaction, relevant range tombstones
are added to the output file's range tombstone metablock and file boundaries are
updated accordingly.
To check whether a key is covered by range tombstone, RangeDelAggregator::ShouldDelete()
considers tombstones in the key's snapshot stripe. When this function is used outside of
compaction, it also checks newer stripes, which can contain covering tombstones. Currently
the intra-stripe check involves a linear scan; however, in the future we plan to collapse ranges
within a stripe such that binary search can be used.
RangeDelAggregator::AddToBuilder() adds all range tombstones in the table's key-range
to a new table's range tombstone meta-block. Since range tombstones may fall in the gap
between files, we may need to extend some files' key-ranges. The strategy is (1) first file
extends as far left as possible and other files do not extend left, (2) all files extend right
until either the start of the next file or the end of the last range tombstone in the gap,
whichever comes first.
One other notable change is adding release/move semantics to ScopedArenaIterator
such that it can be used to transfer ownership of an arena-allocated iterator, similar to
how unique_ptr is used for malloc'd data.
Depends on D61473
Test Plan: compaction_iterator_test, mock_table, end-to-end tests in D63927
Reviewers: sdong, IslamAbdelRahman, wanning, yhchiang, lightmark
Reviewed By: lightmark
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62205
2016-10-18 19:04:56 +00:00
|
|
|
start_key_ = parsed_key.user_key;
|
|
|
|
seq_ = parsed_key.sequence;
|
|
|
|
end_key_ = value;
|
2016-08-19 22:10:31 +00:00
|
|
|
}
|
|
|
|
|
Compaction Support for Range Deletion
Summary:
This diff introduces RangeDelAggregator, which takes ownership of iterators
provided to it via AddTombstones(). The tombstones are organized in a two-level
map (snapshot stripe -> begin key -> tombstone). Tombstone creation avoids data
copy by holding Slices returned by the iterator, which remain valid thanks to pinning.
For compaction, we create a hierarchical range tombstone iterator with structure
matching the iterator over compaction input data. An aggregator based on that
iterator is used by CompactionIterator to determine which keys are covered by
range tombstones. In case of merge operand, the same aggregator is used by
MergeHelper. Upon finishing each file in the compaction, relevant range tombstones
are added to the output file's range tombstone metablock and file boundaries are
updated accordingly.
To check whether a key is covered by range tombstone, RangeDelAggregator::ShouldDelete()
considers tombstones in the key's snapshot stripe. When this function is used outside of
compaction, it also checks newer stripes, which can contain covering tombstones. Currently
the intra-stripe check involves a linear scan; however, in the future we plan to collapse ranges
within a stripe such that binary search can be used.
RangeDelAggregator::AddToBuilder() adds all range tombstones in the table's key-range
to a new table's range tombstone meta-block. Since range tombstones may fall in the gap
between files, we may need to extend some files' key-ranges. The strategy is (1) first file
extends as far left as possible and other files do not extend left, (2) all files extend right
until either the start of the next file or the end of the last range tombstone in the gap,
whichever comes first.
One other notable change is adding release/move semantics to ScopedArenaIterator
such that it can be used to transfer ownership of an arena-allocated iterator, similar to
how unique_ptr is used for malloc'd data.
Depends on D61473
Test Plan: compaction_iterator_test, mock_table, end-to-end tests in D63927
Reviewers: sdong, IslamAbdelRahman, wanning, yhchiang, lightmark
Reviewed By: lightmark
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62205
2016-10-18 19:04:56 +00:00
|
|
|
// be careful to use Serialize(), allocates new memory
|
|
|
|
std::pair<InternalKey, Slice> Serialize() const {
|
2016-08-19 22:10:31 +00:00
|
|
|
auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
|
2022-09-30 23:13:03 +00:00
|
|
|
return std::make_pair(std::move(key), end_key_);
|
2016-08-19 22:10:31 +00:00
|
|
|
}
|
|
|
|
|
Compaction Support for Range Deletion
Summary:
This diff introduces RangeDelAggregator, which takes ownership of iterators
provided to it via AddTombstones(). The tombstones are organized in a two-level
map (snapshot stripe -> begin key -> tombstone). Tombstone creation avoids data
copy by holding Slices returned by the iterator, which remain valid thanks to pinning.
For compaction, we create a hierarchical range tombstone iterator with structure
matching the iterator over compaction input data. An aggregator based on that
iterator is used by CompactionIterator to determine which keys are covered by
range tombstones. In case of merge operand, the same aggregator is used by
MergeHelper. Upon finishing each file in the compaction, relevant range tombstones
are added to the output file's range tombstone metablock and file boundaries are
updated accordingly.
To check whether a key is covered by range tombstone, RangeDelAggregator::ShouldDelete()
considers tombstones in the key's snapshot stripe. When this function is used outside of
compaction, it also checks newer stripes, which can contain covering tombstones. Currently
the intra-stripe check involves a linear scan; however, in the future we plan to collapse ranges
within a stripe such that binary search can be used.
RangeDelAggregator::AddToBuilder() adds all range tombstones in the table's key-range
to a new table's range tombstone meta-block. Since range tombstones may fall in the gap
between files, we may need to extend some files' key-ranges. The strategy is (1) first file
extends as far left as possible and other files do not extend left, (2) all files extend right
until either the start of the next file or the end of the last range tombstone in the gap,
whichever comes first.
One other notable change is adding release/move semantics to ScopedArenaIterator
such that it can be used to transfer ownership of an arena-allocated iterator, similar to
how unique_ptr is used for malloc'd data.
Depends on D61473
Test Plan: compaction_iterator_test, mock_table, end-to-end tests in D63927
Reviewers: sdong, IslamAbdelRahman, wanning, yhchiang, lightmark
Reviewed By: lightmark
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62205
2016-10-18 19:04:56 +00:00
|
|
|
// be careful to use SerializeKey(), allocates new memory
|
|
|
|
InternalKey SerializeKey() const {
|
2016-08-19 22:10:31 +00:00
|
|
|
return InternalKey(start_key_, seq_, kTypeRangeDeletion);
|
|
|
|
}
|
Compaction Support for Range Deletion
Summary:
This diff introduces RangeDelAggregator, which takes ownership of iterators
provided to it via AddTombstones(). The tombstones are organized in a two-level
map (snapshot stripe -> begin key -> tombstone). Tombstone creation avoids data
copy by holding Slices returned by the iterator, which remain valid thanks to pinning.
For compaction, we create a hierarchical range tombstone iterator with structure
matching the iterator over compaction input data. An aggregator based on that
iterator is used by CompactionIterator to determine which keys are covered by
range tombstones. In case of merge operand, the same aggregator is used by
MergeHelper. Upon finishing each file in the compaction, relevant range tombstones
are added to the output file's range tombstone metablock and file boundaries are
updated accordingly.
To check whether a key is covered by range tombstone, RangeDelAggregator::ShouldDelete()
considers tombstones in the key's snapshot stripe. When this function is used outside of
compaction, it also checks newer stripes, which can contain covering tombstones. Currently
the intra-stripe check involves a linear scan; however, in the future we plan to collapse ranges
within a stripe such that binary search can be used.
RangeDelAggregator::AddToBuilder() adds all range tombstones in the table's key-range
to a new table's range tombstone meta-block. Since range tombstones may fall in the gap
between files, we may need to extend some files' key-ranges. The strategy is (1) first file
extends as far left as possible and other files do not extend left, (2) all files extend right
until either the start of the next file or the end of the last range tombstone in the gap,
whichever comes first.
One other notable change is adding release/move semantics to ScopedArenaIterator
such that it can be used to transfer ownership of an arena-allocated iterator, similar to
how unique_ptr is used for malloc'd data.
Depends on D61473
Test Plan: compaction_iterator_test, mock_table, end-to-end tests in D63927
Reviewers: sdong, IslamAbdelRahman, wanning, yhchiang, lightmark
Reviewed By: lightmark
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62205
2016-10-18 19:04:56 +00:00
|
|
|
|
2018-07-14 00:34:54 +00:00
|
|
|
// The tombstone end-key is exclusive, so we generate an internal-key here
|
|
|
|
// which has a similar property. Using kMaxSequenceNumber guarantees that
|
|
|
|
// the returned internal-key will compare less than any other internal-key
|
|
|
|
// with the same user-key. This in turn guarantees that the serialized
|
|
|
|
// end-key for a tombstone such as [a-b] will compare less than the key "b".
|
|
|
|
//
|
Compaction Support for Range Deletion
Summary:
This diff introduces RangeDelAggregator, which takes ownership of iterators
provided to it via AddTombstones(). The tombstones are organized in a two-level
map (snapshot stripe -> begin key -> tombstone). Tombstone creation avoids data
copy by holding Slices returned by the iterator, which remain valid thanks to pinning.
For compaction, we create a hierarchical range tombstone iterator with structure
matching the iterator over compaction input data. An aggregator based on that
iterator is used by CompactionIterator to determine which keys are covered by
range tombstones. In case of merge operand, the same aggregator is used by
MergeHelper. Upon finishing each file in the compaction, relevant range tombstones
are added to the output file's range tombstone metablock and file boundaries are
updated accordingly.
To check whether a key is covered by range tombstone, RangeDelAggregator::ShouldDelete()
considers tombstones in the key's snapshot stripe. When this function is used outside of
compaction, it also checks newer stripes, which can contain covering tombstones. Currently
the intra-stripe check involves a linear scan; however, in the future we plan to collapse ranges
within a stripe such that binary search can be used.
RangeDelAggregator::AddToBuilder() adds all range tombstones in the table's key-range
to a new table's range tombstone meta-block. Since range tombstones may fall in the gap
between files, we may need to extend some files' key-ranges. The strategy is (1) first file
extends as far left as possible and other files do not extend left, (2) all files extend right
until either the start of the next file or the end of the last range tombstone in the gap,
whichever comes first.
One other notable change is adding release/move semantics to ScopedArenaIterator
such that it can be used to transfer ownership of an arena-allocated iterator, similar to
how unique_ptr is used for malloc'd data.
Depends on D61473
Test Plan: compaction_iterator_test, mock_table, end-to-end tests in D63927
Reviewers: sdong, IslamAbdelRahman, wanning, yhchiang, lightmark
Reviewed By: lightmark
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62205
2016-10-18 19:04:56 +00:00
|
|
|
// be careful to use SerializeEndKey(), allocates new memory
|
|
|
|
InternalKey SerializeEndKey() const {
|
2022-09-30 23:13:03 +00:00
|
|
|
if (!ts_.empty()) {
|
|
|
|
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
|
|
|
|
if (ts_.size() <= strlen(kTsMax)) {
|
|
|
|
return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
|
|
|
|
Slice(kTsMax, ts_.size()));
|
|
|
|
} else {
|
|
|
|
return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
|
|
|
|
std::string(ts_.size(), '\xff'));
|
|
|
|
}
|
|
|
|
}
|
2018-07-14 00:34:54 +00:00
|
|
|
return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
|
Compaction Support for Range Deletion
Summary:
This diff introduces RangeDelAggregator, which takes ownership of iterators
provided to it via AddTombstones(). The tombstones are organized in a two-level
map (snapshot stripe -> begin key -> tombstone). Tombstone creation avoids data
copy by holding Slices returned by the iterator, which remain valid thanks to pinning.
For compaction, we create a hierarchical range tombstone iterator with structure
matching the iterator over compaction input data. An aggregator based on that
iterator is used by CompactionIterator to determine which keys are covered by
range tombstones. In case of merge operand, the same aggregator is used by
MergeHelper. Upon finishing each file in the compaction, relevant range tombstones
are added to the output file's range tombstone metablock and file boundaries are
updated accordingly.
To check whether a key is covered by range tombstone, RangeDelAggregator::ShouldDelete()
considers tombstones in the key's snapshot stripe. When this function is used outside of
compaction, it also checks newer stripes, which can contain covering tombstones. Currently
the intra-stripe check involves a linear scan; however, in the future we plan to collapse ranges
within a stripe such that binary search can be used.
RangeDelAggregator::AddToBuilder() adds all range tombstones in the table's key-range
to a new table's range tombstone meta-block. Since range tombstones may fall in the gap
between files, we may need to extend some files' key-ranges. The strategy is (1) first file
extends as far left as possible and other files do not extend left, (2) all files extend right
until either the start of the next file or the end of the last range tombstone in the gap,
whichever comes first.
One other notable change is adding release/move semantics to ScopedArenaIterator
such that it can be used to transfer ownership of an arena-allocated iterator, similar to
how unique_ptr is used for malloc'd data.
Depends on D61473
Test Plan: compaction_iterator_test, mock_table, end-to-end tests in D63927
Reviewers: sdong, IslamAbdelRahman, wanning, yhchiang, lightmark
Reviewed By: lightmark
Subscribers: andrewkr, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D62205
2016-10-18 19:04:56 +00:00
|
|
|
}
|
2016-08-19 22:10:31 +00:00
|
|
|
};
|
|
|
|
|
2019-03-27 23:13:08 +00:00
|
|
|
inline int InternalKeyComparator::Compare(const Slice& akey,
|
|
|
|
const Slice& bkey) const {
|
2018-03-23 20:16:37 +00:00
|
|
|
// Order by:
|
|
|
|
// increasing user key (according to user-supplied comparator)
|
|
|
|
// decreasing sequence number
|
|
|
|
// decreasing type (though sequence# should be enough to disambiguate)
|
2019-03-27 17:24:16 +00:00
|
|
|
int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
|
2018-03-23 20:16:37 +00:00
|
|
|
if (r == 0) {
|
2020-10-01 17:08:52 +00:00
|
|
|
const uint64_t anum =
|
|
|
|
DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes);
|
|
|
|
const uint64_t bnum =
|
|
|
|
DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes);
|
2018-03-23 20:16:37 +00:00
|
|
|
if (anum > bnum) {
|
|
|
|
r = -1;
|
|
|
|
} else if (anum < bnum) {
|
|
|
|
r = +1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2019-03-27 23:13:08 +00:00
|
|
|
inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
|
|
|
|
const Slice& bkey) const {
|
2018-03-23 20:16:37 +00:00
|
|
|
// Order by:
|
|
|
|
// increasing user key (according to user-supplied comparator)
|
|
|
|
// decreasing sequence number
|
2019-03-27 17:24:16 +00:00
|
|
|
int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
|
2018-03-23 20:16:37 +00:00
|
|
|
if (r == 0) {
|
|
|
|
// Shift the number to exclude the last byte which contains the value type
|
2020-10-01 17:08:52 +00:00
|
|
|
const uint64_t anum =
|
|
|
|
DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8;
|
|
|
|
const uint64_t bnum =
|
|
|
|
DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8;
|
2018-03-23 20:16:37 +00:00
|
|
|
if (anum > bnum) {
|
|
|
|
r = -1;
|
|
|
|
} else if (anum < bnum) {
|
|
|
|
r = +1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2023-11-21 01:07:28 +00:00
|
|
|
inline int InternalKeyComparator::CompareKeySeq(const ParsedInternalKey& a,
|
|
|
|
const Slice& b) const {
|
|
|
|
// Order by:
|
|
|
|
// increasing user key (according to user-supplied comparator)
|
|
|
|
// decreasing sequence number
|
|
|
|
int r = user_comparator_.Compare(a.user_key, ExtractUserKey(b));
|
|
|
|
if (r == 0) {
|
|
|
|
// Shift the number to exclude the last byte which contains the value type
|
|
|
|
const uint64_t anum = a.sequence;
|
|
|
|
const uint64_t bnum =
|
|
|
|
DecodeFixed64(b.data() + b.size() - kNumInternalBytes) >> 8;
|
|
|
|
if (anum > bnum) {
|
|
|
|
r = -1;
|
|
|
|
} else if (anum < bnum) {
|
|
|
|
r = +1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2020-07-08 00:25:08 +00:00
|
|
|
inline int InternalKeyComparator::Compare(const Slice& a,
|
|
|
|
SequenceNumber a_global_seqno,
|
|
|
|
const Slice& b,
|
|
|
|
SequenceNumber b_global_seqno) const {
|
|
|
|
int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b));
|
|
|
|
if (r == 0) {
|
|
|
|
uint64_t a_footer, b_footer;
|
|
|
|
if (a_global_seqno == kDisableGlobalSequenceNumber) {
|
|
|
|
a_footer = ExtractInternalKeyFooter(a);
|
|
|
|
} else {
|
|
|
|
a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a));
|
|
|
|
}
|
|
|
|
if (b_global_seqno == kDisableGlobalSequenceNumber) {
|
|
|
|
b_footer = ExtractInternalKeyFooter(b);
|
|
|
|
} else {
|
|
|
|
b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b));
|
|
|
|
}
|
|
|
|
if (a_footer > b_footer) {
|
|
|
|
r = -1;
|
|
|
|
} else if (a_footer < b_footer) {
|
|
|
|
r = +1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2019-05-23 23:16:38 +00:00
|
|
|
// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey.
|
2018-10-25 03:13:06 +00:00
|
|
|
struct ParsedInternalKeyComparator {
|
|
|
|
explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
|
|
|
|
: cmp(c) {}
|
|
|
|
|
|
|
|
bool operator()(const ParsedInternalKey& a,
|
|
|
|
const ParsedInternalKey& b) const {
|
|
|
|
return cmp->Compare(a, b) < 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
const InternalKeyComparator* cmp;
|
|
|
|
};
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|