rocksdb/util/write_batch_util.h

// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once

#include <unordered_set>
#include <vector>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/write_batch.h"

namespace ROCKSDB_NAMESPACE {
// ColumnFamilyCollector is a write batch handler which does nothing
// except record the unique column family IDs it encounters.
class ColumnFamilyCollector : public WriteBatch::Handler {
  std::unordered_set<uint32_t> column_family_ids_;

  Status AddColumnFamilyId(uint32_t column_family_id) {
    column_family_ids_.insert(column_family_id);
    return Status::OK();
  }

 public:
  explicit ColumnFamilyCollector() {}

  ~ColumnFamilyCollector() override {}

  Status PutCF(uint32_t column_family_id, const Slice&,
               const Slice&) override {
    return AddColumnFamilyId(column_family_id);
  }
  Status PutEntityCF(uint32_t column_family_id, const Slice&,
                     const Slice&) override {
    return AddColumnFamilyId(column_family_id);
  }
  Status TimedPutCF(uint32_t column_family_id, const Slice&, const Slice&,
                    uint64_t) override {
    return AddColumnFamilyId(column_family_id);
  }

  Status DeleteCF(uint32_t column_family_id, const Slice&) override {
    return AddColumnFamilyId(column_family_id);
  }

  Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override {
    return AddColumnFamilyId(column_family_id);
  }

  Status DeleteRangeCF(uint32_t column_family_id, const Slice&,
                       const Slice&) override {
    return AddColumnFamilyId(column_family_id);
  }

  Status MergeCF(uint32_t column_family_id, const Slice&,
                 const Slice&) override {
    return AddColumnFamilyId(column_family_id);
  }

  Status PutBlobIndexCF(uint32_t column_family_id, const Slice&,
                        const Slice&) override {
    return AddColumnFamilyId(column_family_id);
  }

  Status MarkBeginPrepare(bool) override { return Status::OK(); }

  Status MarkEndPrepare(const Slice&) override { return Status::OK(); }

  Status MarkRollback(const Slice&) override { return Status::OK(); }

  Status MarkCommit(const Slice&) override { return Status::OK(); }

  Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
    return Status::OK();
  }

  Status MarkNoop(bool) override { return Status::OK(); }

  const std::unordered_set<uint32_t>& column_families() const {
    return column_family_ids_;
  }
};
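
// Example usage (illustrative sketch, not part of this header): iterate a
// populated WriteBatch `batch` with the collector to obtain the set of
// column family IDs it references.
//
//   ColumnFamilyCollector handler;
//   Status s = batch.Iterate(&handler);
//   if (s.ok()) {
//     for (uint32_t cf_id : handler.column_families()) {
//       // ... look up or validate cf_id ...
//     }
//   }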
// Collects the distinct column family IDs referenced by the entries in
// `batch` into `column_family_ids`.
Status CollectColumnFamilyIdsFromWriteBatch(
    const WriteBatch& batch, std::vector<uint32_t>* column_family_ids);
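
// Illustrative call site (assumes a populated WriteBatch `batch`):
//
//   std::vector<uint32_t> cf_ids;
//   Status s = CollectColumnFamilyIdsFromWriteBatch(batch, &cf_ids);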
} // namespace ROCKSDB_NAMESPACE