mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-29 18:33:58 +00:00
9d77bf8f7b
Summary:
- Right now each read fragments the memtable range tombstones (https://github.com/facebook/rocksdb/issues/4808). This PR explores the idea of fragmenting memtable range tombstones in the write path, so that reads can just read this cached fragmented tombstone list without any fragmenting cost. This PR only does the caching for immutable memtables, and does so right before a memtable is added to an immutable memtable list. The fragmentation is done without holding the mutex to minimize its performance impact.
- db_bench is updated to print out the number of range deletions executed, if there are any.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10380

Test Plan:
- CI, added asserts in various places to check whether a fragmented range tombstone list should have been constructed.
- Benchmark: as this PR only optimizes the immutable memtable path, the number of writes in the benchmark is chosen such that an immutable memtable is created and range tombstones are in that memtable.

```
single thread:
./db_bench --benchmarks=fillrandom,readrandom --writes_per_range_tombstone=1 --max_write_buffer_number=100 --min_write_buffer_number_to_merge=100 --writes=500000 --reads=100000 --max_num_range_tombstones=100

multi_thread
./db_bench --benchmarks=fillrandom,readrandom --writes_per_range_tombstone=1 --max_write_buffer_number=100 --min_write_buffer_number_to_merge=100 --writes=15000 --reads=20000 --threads=32 --max_num_range_tombstones=100
```

Commit 99cdf16464 is included in the benchmark result. It was an earlier attempt where tombstones are fragmented for each write operation. Reader threads share it using a shared_ptr, which would slow down multi-thread read performance as seen in the benchmark results. Results are averaged over 5 runs.

Single thread result:

| Max # tombstones | main fillrandom micros/op | 99cdf16464 | Post PR | main readrandom micros/op | 99cdf16464 | Post PR |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
| 0 | 6.68 | 6.57 | 6.72 | 4.72 | 4.79 | 4.54 |
| 1 | 6.67 | 6.58 | 6.62 | 5.41 | 4.74 | 4.72 |
| 10 | 6.59 | 6.5 | 6.56 | 7.83 | 4.69 | 4.59 |
| 100 | 6.62 | 6.75 | 6.58 | 29.57 | 5.04 | 5.09 |
| 1000 | 6.54 | 6.82 | 6.61 | 320.33 | 5.22 | 5.21 |

32-thread result (note that "Max # tombstones" is per thread):

| Max # tombstones | main fillrandom micros/op | 99cdf16464 | Post PR | main readrandom micros/op | 99cdf16464 | Post PR |
| ------------- | ------------- | ------------- | ------------- | ------------- | ------------- | ------------- |
| 0 | 234.52 | 260.25 | 239.42 | 5.06 | 5.38 | 5.09 |
| 1 | 236.46 | 262.0 | 231.1 | 19.57 | 22.14 | 5.45 |
| 10 | 236.95 | 263.84 | 251.49 | 151.73 | 21.61 | 5.73 |
| 100 | 268.16 | 296.8 | 280.13 | 2308.52 | 22.27 | 6.57 |

Reviewed By: ajkr
Differential Revision: D37916564
Pulled By: cbi42
fbshipit-source-id: 05d6d2e16df26c374c57ddcca13a5bfe9d5b731e
344 lines
12 KiB
C++
344 lines
12 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "db/db_impl/db_impl_readonly.h"
|
|
|
|
#include "db/arena_wrapped_db_iter.h"
|
|
#include "db/db_impl/compacted_db_impl.h"
|
|
#include "db/db_impl/db_impl.h"
|
|
#include "db/db_iter.h"
|
|
#include "db/merge_context.h"
|
|
#include "logging/logging.h"
|
|
#include "monitoring/perf_context_imp.h"
|
|
#include "util/cast_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
// Constructs the read-only DB on top of DBImpl (seq_per_batch off,
// batch_per_txn on, read_only on) and writes a line to the info log noting
// that the DB is being opened in read-only mode.
DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
                               const std::string& dbname)
    : DBImpl(db_options, dbname, /*seq_per_batch*/ false,
             /*batch_per_txn*/ true, /*read_only*/ true) {
  const auto& info_log = immutable_db_options_.info_log;
  ROCKS_LOG_INFO(info_log, "Opening the db in read only mode");
  // Flush right away so the message is not left sitting in a buffer.
  LogFlush(info_log);
}
|
|
|
|
// Nothing to release here beyond what the base class and members clean up.
DBImplReadOnly::~DBImplReadOnly() = default;
|
|
|
|
// Implementations of the DB interface
|
|
// Point lookup that does not report a timestamp: forwards to the
// timestamp-aware overload with a null timestamp output.
Status DBImplReadOnly::Get(const ReadOptions& read_options,
                           ColumnFamilyHandle* column_family, const Slice& key,
                           PinnableSlice* pinnable_val) {
  std::string* const no_timestamp = nullptr;
  return Get(read_options, column_family, key, pinnable_val, no_timestamp);
}
|
|
|
|
// Point lookup against the read-only DB.
//
// The read timestamp in `read_options` is first validated against the column
// family; a validation failure is returned as-is. The key is then looked up
// at the latest sequence number, first in the super version's memtable and,
// if that does not produce an answer, in the super version's current Version.
//
// `pinnable_val` must be non-null and receives the value. `timestamp`, when
// non-null, is cleared up front and is only handed to the lookups if the
// column family's comparator has a non-zero timestamp size.
//
// Returns the status produced by the lookup.
Status DBImplReadOnly::Get(const ReadOptions& read_options,
                           ColumnFamilyHandle* column_family, const Slice& key,
                           PinnableSlice* pinnable_val,
                           std::string* timestamp) {
  assert(pinnable_val != nullptr);
  // TODO: stopwatch DB_GET needed?, perf timer needed?
  PERF_TIMER_GUARD(get_snapshot_time);

  assert(column_family);
  {
    // Exactly one of the two checks applies, depending on whether the caller
    // supplied a read timestamp.
    const Status ts_check =
        read_options.timestamp
            ? FailIfTsMismatchCf(column_family, *(read_options.timestamp),
                                 /*ts_for_read=*/true)
            : FailIfCfHasTs(column_family);
    if (!ts_check.ok()) {
      return ts_check;
    }
  }

  // Reset the caller's timestamp so that a tombstone can be told apart from
  // a key that has never been written.
  if (timestamp != nullptr) {
    timestamp->clear();
  }

  const Comparator* const user_cmp = column_family->GetComparator();
  assert(user_cmp);
  std::string* ts_out = nullptr;
  if (user_cmp->timestamp_size() > 0) {
    ts_out = timestamp;
  }

  Status result;
  const SequenceNumber latest_seq = versions_->LastSequence();
  GetWithTimestampReadCallback read_cb(latest_seq);
  auto* const cfd =
      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();

  // tracer_ is tested again once trace_mutex_ is held.
  if (tracer_) {
    InstrumentedMutexLock trace_guard(&trace_mutex_);
    if (tracer_) {
      tracer_->Get(column_family, key);
    }
  }

  SuperVersion* const sv = cfd->GetSuperVersion();
  MergeContext merge_ctx;
  SequenceNumber max_covering_tombstone_seq = 0;
  LookupKey lookup_key(key, latest_seq, read_options.timestamp);
  PERF_TIMER_STOP(get_snapshot_time);

  const bool memtable_hit = sv->mem->Get(
      lookup_key, pinnable_val->GetSelf(), ts_out, &result, &merge_ctx,
      &max_covering_tombstone_seq, read_options,
      false /* immutable_memtable */, &read_cb);
  if (!memtable_hit) {
    // The memtable did not settle the lookup; consult the current Version.
    PERF_TIMER_GUARD(get_from_output_files_time);
    PinnedIteratorsManager pinned_iters_mgr;
    sv->current->Get(read_options, lookup_key, pinnable_val, ts_out, &result,
                     &merge_ctx, &max_covering_tombstone_seq,
                     &pinned_iters_mgr,
                     /*value_found*/ nullptr,
                     /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb,
                     /*is_blob*/ nullptr,
                     /*do_merge*/ true);
    RecordTick(stats_, MEMTABLE_MISS);
  } else {
    // The memtable lookup wrote into the slice's own buffer; pin it.
    pinnable_val->PinSelf();
    RecordTick(stats_, MEMTABLE_HIT);
  }

  RecordTick(stats_, NUMBER_KEYS_READ);
  const size_t bytes_read = pinnable_val->size();
  RecordTick(stats_, BYTES_READ, bytes_read);
  RecordInHistogram(stats_, BYTES_PER_READ, bytes_read);
  PERF_COUNTER_ADD(get_read_bytes, bytes_read);
  return result;
}
|
|
|
|
// Creates an iterator over `column_family`.
//
// If the read timestamp in `read_options` is not valid for the column family,
// an error iterator carrying that status is returned instead. Otherwise the
// iterator reads at the sequence number of `read_options.snapshot` when one
// is given, and at the latest sequence number when it is not.
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
                                      ColumnFamilyHandle* column_family) {
  assert(column_family);
  {
    const Status ts_check =
        read_options.timestamp
            ? FailIfTsMismatchCf(column_family, *(read_options.timestamp),
                                 /*ts_for_read=*/true)
            : FailIfCfHasTs(column_family);
    if (!ts_check.ok()) {
      return NewErrorIterator(ts_check);
    }
  }

  auto* const cfd =
      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
  // Take a reference on the super version for the iterator.
  SuperVersion* const sv = cfd->GetSuperVersion()->Ref();

  // Default to the latest sequence; an explicit snapshot overrides it.
  SequenceNumber read_seq = versions_->LastSequence();
  if (read_options.snapshot != nullptr) {
    read_seq = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
                   ->number_;
  }

  ReadCallback* const no_read_callback = nullptr;  // No read callback provided.
  auto* const db_iter = NewArenaWrappedDbIterator(
      env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
      sv->current, read_seq,
      sv->mutable_cf_options.max_sequential_skip_in_iterations,
      sv->version_number, no_read_callback);
  auto* const internal_iter = NewInternalIterator(
      db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(),
      db_iter->GetRangeDelAggregator(), read_seq,
      /* allow_unprepared_value */ true);
  db_iter->SetIterUnderDBIter(internal_iter);
  return db_iter;
}
|
|
|
|
// Creates one iterator per entry of `column_families`, in the same order, and
// stores them in `*iterators` (which is cleared first).
//
// Every column family is validated against the read timestamp before any
// iterator is created; the first validation failure is returned. Returns
// InvalidArgument if `iterators` is null. All iterators share one read
// sequence: that of `read_options.snapshot` if given, else the latest one.
Status DBImplReadOnly::NewIterators(
    const ReadOptions& read_options,
    const std::vector<ColumnFamilyHandle*>& column_families,
    std::vector<Iterator*>* iterators) {
  const bool has_read_ts = read_options.timestamp != nullptr;
  for (ColumnFamilyHandle* cf : column_families) {
    assert(cf);
    const Status ts_check =
        has_read_ts ? FailIfTsMismatchCf(cf, *(read_options.timestamp),
                                         /*ts_for_read=*/true)
                    : FailIfCfHasTs(cf);
    if (!ts_check.ok()) {
      return ts_check;
    }
  }

  if (iterators == nullptr) {
    return Status::InvalidArgument("iterators not allowed to be nullptr");
  }
  iterators->clear();
  iterators->reserve(column_families.size());

  // Default to the latest sequence; an explicit snapshot overrides it.
  SequenceNumber read_seq = versions_->LastSequence();
  if (read_options.snapshot != nullptr) {
    read_seq = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
                   ->number_;
  }

  ReadCallback* const no_read_callback = nullptr;  // No read callback provided.
  for (ColumnFamilyHandle* handle : column_families) {
    auto* const cfd =
        static_cast_with_check<ColumnFamilyHandleImpl>(handle)->cfd();
    // Each iterator takes its own reference on the super version.
    auto* const sv = cfd->GetSuperVersion()->Ref();
    auto* const db_iter = NewArenaWrappedDbIterator(
        env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
        sv->current, read_seq,
        sv->mutable_cf_options.max_sequential_skip_in_iterations,
        sv->version_number, no_read_callback);
    auto* const internal_iter = NewInternalIterator(
        db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(),
        db_iter->GetRangeDelAggregator(), read_seq,
        /* allow_unprepared_value */ true);
    db_iter->SetIterUnderDBIter(internal_iter);
    iterators->push_back(db_iter);
  }

  return Status::OK();
}
|
|
|
|
namespace {
|
|
// Return OK if dbname exists in the file system or create it if
|
|
// create_if_missing
|
|
Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
|
|
const std::string& dbname) {
|
|
Status s;
|
|
if (!db_options.create_if_missing) {
|
|
// Attempt to read "CURRENT" file
|
|
const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
|
|
std::string manifest_path;
|
|
uint64_t manifest_file_number;
|
|
s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
|
|
&manifest_file_number);
|
|
} else {
|
|
// Historic behavior that doesn't necessarily make sense
|
|
s = db_options.env->CreateDirIfMissing(dbname);
|
|
}
|
|
return s;
|
|
}
|
|
} // namespace
|
|
|
|
// Opens `dbname` read-only with only the default column family.
//
// After the existence check, the DB is first opened as a fully compacted DB;
// if that fails, it is opened as a regular read-only DB. On success `*dbptr`
// points at the opened DB. The error_if_wal_file_exists argument is accepted
// but not used by this overload.
Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
                           DB** dbptr, bool /*error_if_wal_file_exists*/) {
  {
    const Status existence = OpenForReadOnlyCheckExistence(options, dbname);
    if (!existence.ok()) {
      return existence;
    }
  }

  *dbptr = nullptr;

  // Try to first open DB as fully compacted DB
  {
    Status compacted = CompactedDBImpl::Open(options, dbname, dbptr);
    if (compacted.ok()) {
      return compacted;
    }
  }

  // Fall back to the general read-only implementation, opening just the
  // default column family with the caller's options.
  DBOptions db_options(options);
  ColumnFamilyOptions cf_options(options);
  std::vector<ColumnFamilyDescriptor> column_families;
  column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
  std::vector<ColumnFamilyHandle*> handles;

  Status opened = DBImplReadOnly::OpenForReadOnlyWithoutCheck(
      db_options, dbname, column_families, &handles, dbptr);
  if (!opened.ok()) {
    return opened;
  }

  assert(handles.size() == 1);
  // The handle can be deleted because DBImpl always holds a reference to the
  // default column family.
  delete handles[0];
  return opened;
}
|
|
|
|
// Opens `dbname` read-only with an explicit list of column families.
//
// Runs the existence check first and returns its error if it fails;
// otherwise delegates to DBImplReadOnly::OpenForReadOnlyWithoutCheck with all
// arguments passed through, including error_if_wal_file_exists.
Status DB::OpenForReadOnly(
    const DBOptions& db_options, const std::string& dbname,
    const std::vector<ColumnFamilyDescriptor>& column_families,
    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
    bool error_if_wal_file_exists) {
  // If dbname does not exist in the file system, should not do anything
  Status result = OpenForReadOnlyCheckExistence(db_options, dbname);
  if (result.ok()) {
    result = DBImplReadOnly::OpenForReadOnlyWithoutCheck(
        db_options, dbname, column_families, handles, dbptr,
        error_if_wal_file_exists);
  }
  return result;
}
|
|
|
|
// Opens `dbname` as a read-only DB without first checking that it exists.
//
// `column_families` lists the column families to open; on success `*handles`
// holds one newly allocated handle per descriptor, in the same order, and
// `*dbptr` points at the new DB. `error_if_wal_file_exists` is forwarded to
// Recover().
//
// On failure (recovery error, or a requested column family that is not
// found) every handle created so far and the DB object are deleted,
// `*handles` is left empty, `*dbptr` stays nullptr, and the error is
// returned.
Status DBImplReadOnly::OpenForReadOnlyWithoutCheck(
    const DBOptions& db_options, const std::string& dbname,
    const std::vector<ColumnFamilyDescriptor>& column_families,
    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
    bool error_if_wal_file_exists) {
  *dbptr = nullptr;
  handles->clear();

  SuperVersionContext sv_context(/* create_superversion */ true);
  DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
  impl->mutex_.Lock();
  Status s = impl->Recover(column_families, true /* read only */,
                           error_if_wal_file_exists);
  if (s.ok()) {
    // set column family handles
    // Iterate by const reference: only cf.name is read, so there is no need
    // to copy each descriptor (name plus full options).
    for (const auto& cf : column_families) {
      auto cfd =
          impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
      if (cfd == nullptr) {
        s = Status::InvalidArgument("Column family not found", cf.name);
        break;
      }
      handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
    }
  }
  if (s.ok()) {
    // Install a super version for every column family in the set, not only
    // the ones the caller asked for.
    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
      sv_context.NewSuperVersion();
      cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
    }
  }
  impl->mutex_.Unlock();
  // Cleaned after the mutex has been released.
  sv_context.Clean();
  if (s.ok()) {
    *dbptr = impl;
    for (auto* h : *handles) {
      impl->NewThreadStatusCfInfo(
          static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
    }
  } else {
    // Undo partial work so the caller does not receive dangling handles.
    for (auto h : *handles) {
      delete h;
    }
    handles->clear();
    delete impl;
  }
  return s;
}
|
|
|
|
#else // !ROCKSDB_LITE
|
|
|
|
// ROCKSDB_LITE stub for the default-column-family overload: always reports
// NotSupported and ignores every argument.
Status DB::OpenForReadOnly(const Options& /*options*/,
                           const std::string& /*dbname*/, DB** /*dbptr*/,
                           bool /*error_if_wal_file_exists*/) {
  Status unsupported = Status::NotSupported("Not supported in ROCKSDB_LITE.");
  return unsupported;
}
|
|
|
|
// ROCKSDB_LITE stub for the multi-column-family overload: always reports
// NotSupported and ignores every argument.
Status DB::OpenForReadOnly(
    const DBOptions& /*db_options*/, const std::string& /*dbname*/,
    const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
    std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
    bool /*error_if_wal_file_exists*/) {
  Status unsupported = Status::NotSupported("Not supported in ROCKSDB_LITE.");
  return unsupported;
}
|
|
#endif // !ROCKSDB_LITE
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|