mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 02:44:18 +00:00
99b371b417
Summary: **Context/Summary:** We ignore the trace writing status, e.g., at 543191f2ea/db/db_impl/db_impl_write.cc (L221-L222).
If a write into the trace file fails, subsequent trace writes will continue onto the same file. This triggers the assertion `assert(sync_without_flush_called_)`, which is intended to catch writes to a file that has previously seen an error and was added in https://github.com/facebook/rocksdb/pull/10489 and https://github.com/facebook/rocksdb/pull/10555. The alternative (rejected) is to handle the trace writing status at a higher level, e.g., at 543191f2ea/db/db_impl/db_impl_write.cc (L221-L222).
However, it makes sense to ignore such a status, considering that tracing is not a critical component of db operation but an assisting one. In addition, this alternative requires more code changes. So it is better to handle the failure at a lower level, as this PR does. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11996 Test Plan: Add a new UT that failed before this PR and passes after it. Reviewed By: akankshamahajan15 Differential Revision: D50532467 Pulled By: hx235 fbshipit-source-id: f2032abafd94917adbf89a20841d15b448782a33
185 lines
6 KiB
C++
185 lines
6 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#include <atomic>
|
|
#include <memory>
|
|
#include <mutex>
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
|
|
#include "rocksdb/options.h"
|
|
#include "rocksdb/rocksdb_namespace.h"
|
|
#include "rocksdb/status.h"
|
|
#include "rocksdb/trace_record.h"
|
|
#include "rocksdb/utilities/replayer.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// This file contains Tracer and Replayer classes that enable capturing and
|
|
// replaying RocksDB traces.
|
|
|
|
class ColumnFamilyHandle;
|
|
class ColumnFamilyData;
|
|
class DB;
|
|
class DBImpl;
|
|
class Env;
|
|
class Slice;
|
|
class SystemClock;
|
|
class TraceReader;
|
|
class TraceWriter;
|
|
class WriteBatch;
|
|
|
|
struct ReadOptions;
|
|
struct TraceOptions;
|
|
struct WriteOptions;
|
|
|
|
// Trace magic value. Only declared here; the definition lives elsewhere.
extern const std::string kTraceMagic;

// Sizes of the individual metadata fields that accompany a trace: the
// timestamp, the type and the payload length.
// NOTE(review): presumably expressed in bytes -- confirm against the encoder.
constexpr unsigned int kTraceTimestampSize = 8;
constexpr unsigned int kTraceTypeSize = 1;
constexpr unsigned int kTracePayloadLengthSize = 4;
// Combined size of the three metadata fields above.
constexpr unsigned int kTraceMetadataSize =
    kTraceTimestampSize + kTraceTypeSize + kTracePayloadLengthSize;

// Major and minor version of the trace file.
constexpr int kTraceFileMajorVersion = 0;
constexpr int kTraceFileMinorVersion = 2;
|
|
|
|
// The data structure that defines a single trace.
|
|
struct Trace {
|
|
uint64_t ts; // timestamp
|
|
TraceType type;
|
|
// Each bit in payload_map stores which corresponding struct member added in
|
|
// the payload. Each TraceType has its corresponding payload struct. For
|
|
// example, if bit at position 0 is set in write payload, then the write batch
|
|
// will be addedd.
|
|
uint64_t payload_map = 0;
|
|
// Each trace type has its own payload_struct, which will be serilized in the
|
|
// payload.
|
|
std::string payload;
|
|
|
|
void reset() {
|
|
ts = 0;
|
|
type = kTraceMax;
|
|
payload_map = 0;
|
|
payload.clear();
|
|
}
|
|
};
|
|
|
|
// Flags identifying the individual members of the query payload structs.
// NOTE(review): presumably each value is the bit position used in
// Trace::payload_map -- confirm against TracerHelper::SetPayloadMap.
enum TracePayloadType : char {
  // Each member of all query payload structs should have a corresponding flag
  // here. Make sure to add them sequentially, in the order in which the
  // members are added.
  kEmptyPayload = 0,
  kWriteBatchData = 1,
  kGetCFID = 2,
  kGetKey = 3,
  kIterCFID = 4,
  kIterKey = 5,
  kIterLowerBound = 6,
  kIterUpperBound = 7,
  kMultiGetSize = 8,
  kMultiGetCFIDs = 9,
  kMultiGetKeys = 10,
};
|
|
|
|
class TracerHelper {
|
|
public:
|
|
// Parse the string with major and minor version only
|
|
static Status ParseVersionStr(std::string& v_string, int* v_num);
|
|
|
|
// Parse the trace file version and db version in trace header
|
|
static Status ParseTraceHeader(const Trace& header, int* trace_version,
|
|
int* db_version);
|
|
|
|
// Encode a version 0.1 trace object into the given string.
|
|
static void EncodeTrace(const Trace& trace, std::string* encoded_trace);
|
|
|
|
// Decode a string into the given trace object.
|
|
static Status DecodeTrace(const std::string& encoded_trace, Trace* trace);
|
|
|
|
// Decode a string into the given trace header.
|
|
static Status DecodeHeader(const std::string& encoded_trace, Trace* header);
|
|
|
|
// Set the payload map based on the payload type
|
|
static bool SetPayloadMap(uint64_t& payload_map,
|
|
const TracePayloadType payload_type);
|
|
|
|
// Decode a Trace object into the corresponding TraceRecord.
|
|
// Return Status::OK() if nothing is wrong, record will be set accordingly.
|
|
// Return Status::NotSupported() if the trace type is not support, or the
|
|
// corresponding error status, record will be set to nullptr.
|
|
static Status DecodeTraceRecord(Trace* trace, int trace_file_version,
|
|
std::unique_ptr<TraceRecord>* record);
|
|
};
|
|
|
|
// Tracer captures all RocksDB operations using a user-provided TraceWriter.
|
|
// Every RocksDB operation is written as a single trace. Each trace will have a
|
|
// timestamp and type, followed by the trace payload.
|
|
class Tracer {
|
|
public:
|
|
Tracer(SystemClock* clock, const TraceOptions& trace_options,
|
|
std::unique_ptr<TraceWriter>&& trace_writer);
|
|
~Tracer();
|
|
|
|
// Trace all write operations -- Put, Merge, Delete, SingleDelete, Write
|
|
Status Write(WriteBatch* write_batch);
|
|
|
|
// Trace Get operations.
|
|
Status Get(ColumnFamilyHandle* cfname, const Slice& key);
|
|
|
|
// Trace Iterators.
|
|
Status IteratorSeek(const uint32_t& cf_id, const Slice& key,
|
|
const Slice& lower_bound, const Slice upper_bound);
|
|
Status IteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
|
|
const Slice& lower_bound, const Slice upper_bound);
|
|
|
|
// Trace MultiGet
|
|
|
|
Status MultiGet(const size_t num_keys, ColumnFamilyHandle** column_families,
|
|
const Slice* keys);
|
|
|
|
Status MultiGet(const size_t num_keys, ColumnFamilyHandle* column_family,
|
|
const Slice* keys);
|
|
|
|
Status MultiGet(const std::vector<ColumnFamilyHandle*>& column_family,
|
|
const std::vector<Slice>& keys);
|
|
|
|
// Returns true if the trace is over the configured max trace file limit.
|
|
// False otherwise.
|
|
bool IsTraceFileOverMax();
|
|
|
|
// Returns true if the order of write trace records must match the order of
|
|
// the corresponding records logged to WAL and applied to the DB.
|
|
bool IsWriteOrderPreserved() { return trace_options_.preserve_write_order; }
|
|
|
|
// Writes a trace footer at the end of the tracing
|
|
Status Close();
|
|
|
|
private:
|
|
// Write a trace header at the beginning, typically on initiating a trace,
|
|
// with some metadata like a magic number, trace version, RocksDB version, and
|
|
// trace format.
|
|
Status WriteHeader();
|
|
|
|
// Write a trace footer, typically on ending a trace, with some metadata.
|
|
Status WriteFooter();
|
|
|
|
// Write a single trace using the provided TraceWriter to the underlying
|
|
// system, say, a filesystem or a streaming service.
|
|
Status WriteTrace(const Trace& trace);
|
|
|
|
// Helps in filtering and sampling of traces.
|
|
// Returns true if a trace should be skipped, false otherwise.
|
|
bool ShouldSkipTrace(const TraceType& type);
|
|
|
|
SystemClock* clock_;
|
|
TraceOptions trace_options_;
|
|
std::unique_ptr<TraceWriter> trace_writer_;
|
|
uint64_t trace_request_count_;
|
|
Status trace_write_status_;
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|