// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once #include #include #include "db/flush_scheduler.h" #include "db/kv_checksum.h" #include "db/trim_history_scheduler.h" #include "db/write_thread.h" #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "rocksdb/write_batch.h" #include "util/autovector.h" #include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { class MemTable; class FlushScheduler; class ColumnFamilyData; class ColumnFamilyMemTables { public: virtual ~ColumnFamilyMemTables() {} virtual bool Seek(uint32_t column_family_id) = 0; // returns true if the update to memtable should be ignored // (useful when recovering from log whose updates have already // been processed) virtual uint64_t GetLogNumber() const = 0; virtual MemTable* GetMemTable() const = 0; virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; virtual ColumnFamilyData* current() { return nullptr; } }; class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { public: explicit ColumnFamilyMemTablesDefault(MemTable* mem) : ok_(false), mem_(mem) {} bool Seek(uint32_t column_family_id) override { ok_ = (column_family_id == 0); return ok_; } uint64_t GetLogNumber() const override { return 0; } MemTable* GetMemTable() const override { assert(ok_); return mem_; } ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } private: bool ok_; MemTable* mem_; }; struct WriteBatch::ProtectionInfo { // `WriteBatch` usually doesn't contain a huge number of keys so protecting // with a fixed, non-configurable eight bytes per key may work well enough. autovector entries_; size_t GetBytesPerKey() const { return 8; } }; // WriteBatchInternal provides static methods for manipulating a // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { public: // WriteBatch header has an 8-byte sequence number followed by a 4-byte count. static constexpr size_t kHeader = 12; // WriteBatch methods with column_family_id instead of ColumnFamilyHandle* static Status Put(WriteBatch* batch, uint32_t column_family_id, const Slice& key, const Slice& value); static Status Put(WriteBatch* batch, uint32_t column_family_id, const SliceParts& key, const SliceParts& value); static Status TimedPut(WriteBatch* batch, uint32_t column_family_id, const Slice& key, const Slice& value, uint64_t unix_write_time); static Status PutEntity(WriteBatch* batch, uint32_t column_family_id, const Slice& key, const WideColumns& columns); static Status Delete(WriteBatch* batch, uint32_t column_family_id, const SliceParts& key); static Status Delete(WriteBatch* batch, uint32_t column_family_id, const Slice& key); static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id, const SliceParts& key); static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id, const Slice& key); static Status DeleteRange(WriteBatch* b, uint32_t column_family_id, const Slice& begin_key, const Slice& end_key); static Status DeleteRange(WriteBatch* b, uint32_t column_family_id, const SliceParts& begin_key, const SliceParts& end_key); static Status Merge(WriteBatch* batch, uint32_t column_family_id, const Slice& key, const Slice& value); static Status Merge(WriteBatch* batch, uint32_t column_family_id, const SliceParts& key, const SliceParts& value); static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id, const Slice& key, const Slice& value); static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid, const bool write_after_commit = true, const bool unprepared_batch = false); static Status MarkRollback(WriteBatch* batch, const Slice& xid); static Status MarkCommit(WriteBatch* batch, const Slice& xid); static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid, const Slice& commit_ts); static Status InsertNoop(WriteBatch* batch); // Return the number of entries in the batch. static uint32_t Count(const WriteBatch* batch); // Set the count for the number of entries in the batch. static void SetCount(WriteBatch* batch, uint32_t n); // Return the sequence number for the start of this batch. static SequenceNumber Sequence(const WriteBatch* batch); // Store the specified number as the sequence number for the start of // this batch. static void SetSequence(WriteBatch* batch, SequenceNumber seq); // Returns the offset of the first entry in the batch. // This offset is only valid if the batch is not empty. static size_t GetFirstOffset(WriteBatch* batch); static Slice Contents(const WriteBatch* batch) { return Slice(batch->rep_); } static size_t ByteSize(const WriteBatch* batch) { return batch->rep_.size(); } static Status SetContents(WriteBatch* batch, const Slice& contents); static Status CheckSlicePartsLength(const SliceParts& key, const SliceParts& value); // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive. // // If ignore_missing_column_families == true. WriteBatch // referencing non-existing column family will be ignored. // If ignore_missing_column_families == false, processing of the // batches will be stopped if a reference is found to a non-existing // column family and InvalidArgument() will be returned. The writes // in batches may be only partially applied at that point. // // If log_number is non-zero, the memtable will be updated only if // memtables->GetLogNumber() >= log_number. // // If flush_scheduler is non-null, it will be invoked if the memtable // should be flushed. // // Under concurrent use, the caller is responsible for making sure that // the memtables object itself is thread-local. static Status InsertInto( WriteThread::WriteGroup& write_group, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, bool seq_per_batch = false, bool batch_per_txn = true); // Convenience form of InsertInto when you have only one batch // next_seq returns the seq after last sequence number used in MemTable insert static Status InsertInto( const WriteBatch* batch, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr, bool seq_per_batch = false, bool batch_per_txn = true); static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, TrimHistoryScheduler* trim_history_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, bool concurrent_memtable_writes = false, bool seq_per_batch = false, size_t batch_cnt = 0, bool batch_per_txn = true, bool hint_per_batch = false); // Appends src write batch to dst write batch and updates count in dst // write batch. Returns OK if the append is successful. Checks number of // checksum against count in dst and src write batches, and returns Corruption // if the count is inconsistent. static Status Append(WriteBatch* dst, const WriteBatch* src, const bool WAL_only = false); // Returns the byte size of appending a WriteBatch with ByteSize // leftByteSize and a WriteBatch with ByteSize rightByteSize static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize); // Iterate over [begin, end) range of a write batch static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler, size_t begin, size_t end); // This write batch includes the latest state that should be persisted. Such // state meant to be used only during recovery. static void SetAsLatestPersistentState(WriteBatch* b); static bool IsLatestPersistentState(const WriteBatch* b); static void SetDefaultColumnFamilyTimestampSize(WriteBatch* wb, size_t default_cf_ts_sz); static std::tuple GetColumnFamilyIdAndTimestampSize( WriteBatch* b, ColumnFamilyHandle* column_family); static bool TimestampsUpdateNeeded(const WriteBatch& wb) { return wb.needs_in_place_update_ts_; } static bool HasKeyWithTimestamp(const WriteBatch& wb) { return wb.has_key_with_ts_; } // Update per-key value protection information on this write batch. // If checksum is provided, the batch content is verfied against the checksum. static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key, uint64_t* checksum = nullptr); }; // LocalSavePoint is similar to a scope guard class LocalSavePoint { public: explicit LocalSavePoint(WriteBatch* batch) : batch_(batch), savepoint_(batch->GetDataSize(), batch->Count(), batch->content_flags_.load(std::memory_order_relaxed)) #ifndef NDEBUG , committed_(false) #endif { } #ifndef NDEBUG ~LocalSavePoint() { assert(committed_); } #endif Status commit() { #ifndef NDEBUG committed_ = true; #endif if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) { batch_->rep_.resize(savepoint_.size); WriteBatchInternal::SetCount(batch_, savepoint_.count); if (batch_->prot_info_ != nullptr) { batch_->prot_info_->entries_.resize(savepoint_.count); } batch_->content_flags_.store(savepoint_.content_flags, std::memory_order_relaxed); return Status::MemoryLimit(); } return Status::OK(); } private: WriteBatch* batch_; SavePoint savepoint_; #ifndef NDEBUG bool committed_; #endif }; template class TimestampUpdater : public WriteBatch::Handler { public: explicit TimestampUpdater(WriteBatch::ProtectionInfo* prot_info, TimestampSizeFuncType&& ts_sz_func, const Slice& ts) : prot_info_(prot_info), ts_sz_func_(std::move(ts_sz_func)), timestamp_(ts) { assert(!timestamp_.empty()); } ~TimestampUpdater() override {} Status PutCF(uint32_t cf, const Slice& key, const Slice&) override { return UpdateTimestamp(cf, key); } Status DeleteCF(uint32_t cf, const Slice& key) override { return UpdateTimestamp(cf, key); } Status SingleDeleteCF(uint32_t cf, const Slice& key) override { return UpdateTimestamp(cf, key); } Status DeleteRangeCF(uint32_t cf, const Slice& begin_key, const Slice& end_key) override { Status s = UpdateTimestamp(cf, begin_key, true /* is_key */); if (s.ok()) { s = UpdateTimestamp(cf, end_key, false /* is_key */); } return s; } Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override { return UpdateTimestamp(cf, key); } Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override { return UpdateTimestamp(cf, key); } Status MarkBeginPrepare(bool) override { return Status::OK(); } Status MarkEndPrepare(const Slice&) override { return Status::OK(); } Status MarkCommit(const Slice&) override { return Status::OK(); } Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { return Status::OK(); } Status MarkRollback(const Slice&) override { return Status::OK(); } Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); } private: // @param is_key specifies whether the update is for key or value. Status UpdateTimestamp(uint32_t cf, const Slice& buf, bool is_key = true) { Status s = UpdateTimestampImpl(cf, buf, idx_, is_key); ++idx_; return s; } Status UpdateTimestampImpl(uint32_t cf, const Slice& buf, size_t /*idx*/, bool is_key) { if (timestamp_.empty()) { return Status::InvalidArgument("Timestamp is empty"); } size_t cf_ts_sz = ts_sz_func_(cf); if (0 == cf_ts_sz) { // Skip this column family. return Status::OK(); } else if (std::numeric_limits::max() == cf_ts_sz) { // Column family timestamp info not found. return Status::NotFound(); } else if (cf_ts_sz != timestamp_.size()) { return Status::InvalidArgument("timestamp size mismatch"); } UpdateProtectionInformationIfNeeded(buf, timestamp_, is_key); char* ptr = const_cast(buf.data() + buf.size() - cf_ts_sz); assert(ptr); memcpy(ptr, timestamp_.data(), timestamp_.size()); return Status::OK(); } void UpdateProtectionInformationIfNeeded(const Slice& buf, const Slice& ts, bool is_key) { if (prot_info_ != nullptr) { const size_t ts_sz = ts.size(); SliceParts old(&buf, 1); Slice old_no_ts(buf.data(), buf.size() - ts_sz); std::array new_key_cmpts{{old_no_ts, ts}}; SliceParts new_parts(new_key_cmpts.data(), 2); if (is_key) { prot_info_->entries_[idx_].UpdateK(old, new_parts); } else { prot_info_->entries_[idx_].UpdateV(old, new_parts); } } } // No copy or move. TimestampUpdater(const TimestampUpdater&) = delete; TimestampUpdater(TimestampUpdater&&) = delete; TimestampUpdater& operator=(const TimestampUpdater&) = delete; TimestampUpdater& operator=(TimestampUpdater&&) = delete; WriteBatch::ProtectionInfo* const prot_info_ = nullptr; const TimestampSizeFuncType ts_sz_func_{}; const Slice timestamp_; size_t idx_ = 0; }; } // namespace ROCKSDB_NAMESPACE