mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-30 22:41:48 +00:00
3dff28cf9b
Summary: For performance purposes, the lower level routines were changed to use a SystemClock* instead of a std::shared_ptr<SystemClock>. The shared ptr has some performance degradation on certain hardware classes. For most of the system, there is no risk of the pointer being deleted/invalid because the shared_ptr will be stored elsewhere. For example, the ImmutableDBOptions stores the Env which has a std::shared_ptr<SystemClock> in it. The SystemClock* within the ImmutableDBOptions is essentially a "short cut" to gain access to this constant resource. There were a few classes (PeriodicWorkScheduler?) where the "short cut" property did not hold. In those cases, the shared pointer was preserved. Using db_bench readrandom perf_level=3 on my EC2 box, this change performed as well or better than 6.17: 6.17: readrandom : 28.046 micros/op 854902 ops/sec; 61.3 MB/s (355999 of 355999 found) 6.18: readrandom : 32.615 micros/op 735306 ops/sec; 52.7 MB/s (290999 of 290999 found) PR: readrandom : 27.500 micros/op 871909 ops/sec; 62.5 MB/s (367999 of 367999 found) (Note that the times for 6.18 are prior to revert of the SystemClock). Pull Request resolved: https://github.com/facebook/rocksdb/pull/8033 Reviewed By: pdillinger Differential Revision: D27014563 Pulled By: mrambacher fbshipit-source-id: ad0459eba03182e454391b5926bf5cdd45657b67
259 lines
8.7 KiB
C++
259 lines
8.7 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "trace_replay/io_tracer.h"
|
|
|
|
#include <cinttypes>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
#include "db/dbformat.h"
|
|
#include "rocksdb/slice.h"
|
|
#include "rocksdb/system_clock.h"
|
|
#include "rocksdb/trace_reader_writer.h"
|
|
#include "util/coding.h"
|
|
#include "util/hash.h"
|
|
#include "util/string_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
// Constructs a writer that emits IO trace entries through `trace_writer`.
//
// `clock` is stored as a raw pointer and is used by WriteHeader() to
// timestamp the header entry. `trace_options` is copied. Ownership of
// `trace_writer` is moved into this object.
IOTraceWriter::IOTraceWriter(SystemClock* clock,
                             const TraceOptions& trace_options,
                             std::unique_ptr<TraceWriter>&& trace_writer)
    : clock_(clock),
      trace_options_(trace_options),
      trace_writer_(std::move(trace_writer)) {
}
|
|
|
|
// Serializes one IO trace record and appends it to the trace file.
//
// Payload layout, in order:
//   fixed64          io_op_data
//   length-prefixed  file operation
//   fixed64          latency
//   length-prefixed  IO status
//   length-prefixed  file name
//   fixed64          one value per bit set in io_op_data, lowest bit first
//
// Returns OK without writing anything once the trace file has grown beyond
// trace_options_.max_trace_file_size; otherwise returns the status of the
// underlying trace writer's Write() call.
Status IOTraceWriter::WriteIOOp(const IOTraceRecord& record) {
  const uint64_t trace_file_size = trace_writer_->GetFileSize();
  if (trace_file_size > trace_options_.max_trace_file_size) {
    return Status::OK();
  }

  Trace trace;
  trace.ts = record.access_timestamp;
  trace.type = record.trace_type;
  PutFixed64(&trace.payload, record.io_op_data);
  Slice file_operation(record.file_operation);
  PutLengthPrefixedSlice(&trace.payload, file_operation);
  PutFixed64(&trace.payload, record.latency);
  Slice io_status(record.io_status);
  PutLengthPrefixedSlice(&trace.payload, io_status);
  Slice file_name(record.file_name);
  PutLengthPrefixedSlice(&trace.payload, file_name);

  // Each bit in io_op_data says which piece of information from IOTraceOp is
  // added to the trace. For example, if the bit at position 1 is set then
  // IOTraceOp::kIOLen (length) is logged in the record (since
  // IOTraceOp::kIOLen = 1 in the enum).
  //
  // Write the remaining options based on io_op_data set by the file
  // operation. The bits are scanned from the lowest position upwards using
  // unsigned arithmetic only: this avoids a floating-point log2() call and
  // the signed-overflow undefined behaviour that negating the value would
  // trigger if the top bit were ever set.
  uint64_t pending_bits = record.io_op_data;
  for (uint32_t set_pos = 0; pending_bits != 0; ++set_pos, pending_bits >>= 1) {
    if ((pending_bits & 1u) == 0) {
      continue;
    }
    switch (set_pos) {
      case IOTraceOp::kIOFileSize:
        PutFixed64(&trace.payload, record.file_size);
        break;
      case IOTraceOp::kIOLen:
        PutFixed64(&trace.payload, record.len);
        break;
      case IOTraceOp::kIOOffset:
        PutFixed64(&trace.payload, record.offset);
        break;
      default:
        // A bit with no matching IOTraceOp entry is a programming error.
        assert(false);
    }
  }

  std::string encoded_trace;
  TracerHelper::EncodeTrace(trace, &encoded_trace);
  return trace_writer_->Write(encoded_trace);
}
|
|
|
|
// Writes the header entry of the trace file.
//
// The header is a kTraceBegin entry stamped with the current time from
// clock_ whose payload holds the length-prefixed magic string followed by the
// major and minor version numbers as fixed32 values. Returns the status of
// the underlying trace writer's Write() call.
Status IOTraceWriter::WriteHeader() {
  Trace header;
  header.ts = clock_->NowMicros();
  header.type = TraceType::kTraceBegin;

  std::string* const payload = &header.payload;
  PutLengthPrefixedSlice(payload, kTraceMagic);
  PutFixed32(payload, kMajorVersion);
  PutFixed32(payload, kMinorVersion);

  std::string encoded_header;
  TracerHelper::EncodeTrace(header, &encoded_header);
  return trace_writer_->Write(encoded_header);
}
|
|
|
|
// Constructs a reader that pulls trace entries from `reader`; ownership of
// `reader` is moved into this object.
IOTraceReader::IOTraceReader(std::unique_ptr<TraceReader>&& reader)
    : trace_reader_(std::move(reader)) {
}
|
|
|
|
// Reads the next entry from the trace reader and parses it as the trace
// header, filling in `header` (which must not be null).
//
// Returns the reader's or decoder's error status if the entry cannot be read
// or decoded, and Status::Corruption if the magic string is missing or does
// not match, if either version number is missing, or if bytes remain after
// the minor version. Note that header->start_time is assigned before the
// payload is validated.
Status IOTraceReader::ReadHeader(IOTraceHeader* header) {
  assert(header != nullptr);

  std::string raw_entry;
  Status status = trace_reader_->Read(&raw_entry);
  if (!status.ok()) {
    return status;
  }

  Trace decoded;
  status = TracerHelper::DecodeTrace(raw_entry, &decoded);
  if (!status.ok()) {
    return status;
  }

  header->start_time = decoded.ts;

  // `remaining` is consumed front-to-back as each field is parsed.
  Slice remaining(decoded.payload);

  Slice magic;
  const bool has_magic = GetLengthPrefixedSlice(&remaining, &magic);
  if (!has_magic) {
    return Status::Corruption(
        "Corrupted header in the trace file: Failed to read the magic number.");
  }
  if (kTraceMagic != magic.ToString()) {
    return Status::Corruption(
        "Corrupted header in the trace file: Magic number does not match.");
  }

  const bool has_major = GetFixed32(&remaining, &header->rocksdb_major_version);
  if (!has_major) {
    return Status::Corruption(
        "Corrupted header in the trace file: Failed to read rocksdb major "
        "version number.");
  }

  const bool has_minor = GetFixed32(&remaining, &header->rocksdb_minor_version);
  if (!has_minor) {
    return Status::Corruption(
        "Corrupted header in the trace file: Failed to read rocksdb minor "
        "version number.");
  }

  // Everything in the header should have been consumed by now.
  if (!remaining.empty()) {
    return Status::Corruption(
        "Corrupted header in the trace file: The length of header is too "
        "long.");
  }

  return Status::OK();
}
|
|
|
|
// Reads the next entry from the trace reader and decodes it into `record`
// (which must not be null).
//
// The fixed part of the payload is parsed first: io_op_data, file operation,
// latency, IO status and file name. After that, one fixed64 value is read for
// every bit set in io_op_data, lowest bit first; fields whose bit is not set
// are left untouched in `record`.
//
// Returns the reader's or decoder's error status if the entry cannot be read
// or decoded, Status::Incomplete if the payload ends before an expected field,
// and OK otherwise.
Status IOTraceReader::ReadIOOp(IOTraceRecord* record) {
  assert(record);
  std::string encoded_trace;
  Status s = trace_reader_->Read(&encoded_trace);
  if (!s.ok()) {
    return s;
  }
  Trace trace;
  s = TracerHelper::DecodeTrace(encoded_trace, &trace);
  if (!s.ok()) {
    return s;
  }
  record->access_timestamp = trace.ts;
  record->trace_type = trace.type;
  Slice enc_slice = Slice(trace.payload);

  if (!GetFixed64(&enc_slice, &record->io_op_data)) {
    return Status::Incomplete(
        "Incomplete access record: Failed to read trace data.");
  }
  Slice file_operation;
  if (!GetLengthPrefixedSlice(&enc_slice, &file_operation)) {
    return Status::Incomplete(
        "Incomplete access record: Failed to read file operation.");
  }
  record->file_operation = file_operation.ToString();
  if (!GetFixed64(&enc_slice, &record->latency)) {
    return Status::Incomplete(
        "Incomplete access record: Failed to read latency.");
  }
  Slice io_status;
  if (!GetLengthPrefixedSlice(&enc_slice, &io_status)) {
    return Status::Incomplete(
        "Incomplete access record: Failed to read IO status.");
  }
  record->io_status = io_status.ToString();
  Slice file_name;
  if (!GetLengthPrefixedSlice(&enc_slice, &file_name)) {
    return Status::Incomplete(
        "Incomplete access record: Failed to read file name.");
  }
  record->file_name = file_name.ToString();

  // Each bit in io_op_data says which piece of information from IOTraceOp was
  // added to the trace. For example, if the bit at position 1 is set then
  // IOTraceOp::kIOLen (length) was logged in the record (since
  // IOTraceOp::kIOLen = 1 in the enum).
  //
  // Read the remaining options based on io_op_data set by the file operation.
  // io_op_data comes straight from the trace file, so the bits are scanned
  // with unsigned arithmetic only: any of the 64 bits may be set without
  // hitting signed-overflow undefined behaviour, and no floating-point log2()
  // call is needed.
  uint64_t pending_bits = record->io_op_data;
  for (uint32_t set_pos = 0; pending_bits != 0; ++set_pos, pending_bits >>= 1) {
    if ((pending_bits & 1u) == 0) {
      continue;
    }
    switch (set_pos) {
      case IOTraceOp::kIOFileSize:
        if (!GetFixed64(&enc_slice, &record->file_size)) {
          return Status::Incomplete(
              "Incomplete access record: Failed to read file size.");
        }
        break;
      case IOTraceOp::kIOLen:
        if (!GetFixed64(&enc_slice, &record->len)) {
          return Status::Incomplete(
              "Incomplete access record: Failed to read length.");
        }
        break;
      case IOTraceOp::kIOOffset:
        if (!GetFixed64(&enc_slice, &record->offset)) {
          return Status::Incomplete(
              "Incomplete access record: Failed to read offset.");
        }
        break;
      default:
        // A bit with no matching IOTraceOp entry is not expected here.
        assert(false);
    }
  }
  return Status::OK();
}
|
|
|
|
// Creates a tracer with tracing switched off and no writer attached.
IOTracer::IOTracer() : tracing_enabled(false) {
  writer_.store(nullptr);
}
|
|
|
|
// Ends any trace still in progress, which also frees the current writer.
IOTracer::~IOTracer() {
  EndIOTrace();
}
|
|
|
|
// Starts a new IO trace that writes through `trace_writer`.
//
// Returns Status::Busy if a writer is already installed. Otherwise it stores
// a copy of `trace_options`, installs a freshly allocated IOTraceWriter,
// turns tracing on, and returns the status of writing the trace header.
// The whole operation runs with trace_writer_mutex_ held.
Status IOTracer::StartIOTrace(SystemClock* clock,
                              const TraceOptions& trace_options,
                              std::unique_ptr<TraceWriter>&& trace_writer) {
  InstrumentedMutexLock lock_guard(&trace_writer_mutex_);

  if (writer_.load() != nullptr) {
    return Status::Busy();
  }

  trace_options_ = trace_options;

  IOTraceWriter* const new_writer =
      new IOTraceWriter(clock, trace_options, std::move(trace_writer));
  writer_.store(new_writer);
  tracing_enabled = true;

  return new_writer->WriteHeader();
}
|
|
|
|
void IOTracer::EndIOTrace() {
|
|
InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
|
|
if (!writer_.load()) {
|
|
return;
|
|
}
|
|
delete writer_.load();
|
|
writer_.store(nullptr);
|
|
tracing_enabled = false;
|
|
}
|
|
|
|
// TODO: Return status and handle that in file_system_tracer.h
|
|
void IOTracer::WriteIOOp(const IOTraceRecord& record) {
|
|
if (!writer_.load()) {
|
|
return;
|
|
}
|
|
InstrumentedMutexLock lock_guard(&trace_writer_mutex_);
|
|
if (!writer_.load()) {
|
|
return;
|
|
}
|
|
writer_.load()->WriteIOOp(record).PermitUncheckedError();
|
|
}
|
|
} // namespace ROCKSDB_NAMESPACE
|