mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-26 07:30:54 +00:00
Blob DB: Store blob index as kTypeBlobIndex in base db
Summary: Blob DB now inserts the blob index into the base DB with the kTypeBlobIndex value type, so that values written by plain RocksDB can be told apart from those written by Blob DB. This makes it possible to migrate an existing RocksDB instance to Blob DB. With this patch, Blob DB garbage collection also moves away from OptimisticTransaction; instead it uses a custom write callback to achieve behavior similar to OptimisticTransaction. This is because we need to pass the is_blob_index flag to DBImpl::Get, but OptimisticTransaction doesn't support it. Closes https://github.com/facebook/rocksdb/pull/3000 Differential Revision: D6050044 Pulled By: yiwu-arbug fbshipit-source-id: 61dc72ab9977625e75f78cd968e7d8a3976e3632
This commit is contained in:
parent
0552029b5c
commit
eaaef91178
|
@ -2556,7 +2556,8 @@ SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
|
||||||
#ifndef ROCKSDB_LITE
|
#ifndef ROCKSDB_LITE
|
||||||
Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||||
bool cache_only, SequenceNumber* seq,
|
bool cache_only, SequenceNumber* seq,
|
||||||
bool* found_record_for_key) {
|
bool* found_record_for_key,
|
||||||
|
bool* is_blob_index) {
|
||||||
Status s;
|
Status s;
|
||||||
MergeContext merge_context;
|
MergeContext merge_context;
|
||||||
RangeDelAggregator range_del_agg(sv->mem->GetInternalKeyComparator(),
|
RangeDelAggregator range_del_agg(sv->mem->GetInternalKeyComparator(),
|
||||||
|
@ -2571,7 +2572,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||||
|
|
||||||
// Check if there is a record for this key in the latest memtable
|
// Check if there is a record for this key in the latest memtable
|
||||||
sv->mem->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq,
|
sv->mem->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq,
|
||||||
read_options);
|
read_options, nullptr /*read_callback*/, is_blob_index);
|
||||||
|
|
||||||
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
||||||
// unexpected error reading memtable.
|
// unexpected error reading memtable.
|
||||||
|
@ -2590,7 +2591,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||||
|
|
||||||
// Check if there is a record for this key in the immutable memtables
|
// Check if there is a record for this key in the immutable memtables
|
||||||
sv->imm->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq,
|
sv->imm->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq,
|
||||||
read_options);
|
read_options, nullptr /*read_callback*/, is_blob_index);
|
||||||
|
|
||||||
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
||||||
// unexpected error reading memtable.
|
// unexpected error reading memtable.
|
||||||
|
@ -2609,7 +2610,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||||
|
|
||||||
// Check if there is a record for this key in the immutable memtables
|
// Check if there is a record for this key in the immutable memtables
|
||||||
sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, &range_del_agg,
|
sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, &range_del_agg,
|
||||||
seq, read_options);
|
seq, read_options, is_blob_index);
|
||||||
|
|
||||||
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
||||||
// unexpected error reading memtable.
|
// unexpected error reading memtable.
|
||||||
|
@ -2633,7 +2634,8 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||||
// Check tables
|
// Check tables
|
||||||
sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
|
sv->current->Get(read_options, lkey, nullptr, &s, &merge_context,
|
||||||
&range_del_agg, nullptr /* value_found */,
|
&range_del_agg, nullptr /* value_found */,
|
||||||
found_record_for_key, seq);
|
found_record_for_key, seq, nullptr /*read_callback*/,
|
||||||
|
is_blob_index);
|
||||||
|
|
||||||
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
|
||||||
// unexpected error reading SST files
|
// unexpected error reading SST files
|
||||||
|
|
18
db/db_impl.h
18
db/db_impl.h
|
@ -96,6 +96,14 @@ class DBImpl : public DB {
|
||||||
virtual Status Get(const ReadOptions& options,
|
virtual Status Get(const ReadOptions& options,
|
||||||
ColumnFamilyHandle* column_family, const Slice& key,
|
ColumnFamilyHandle* column_family, const Slice& key,
|
||||||
PinnableSlice* value) override;
|
PinnableSlice* value) override;
|
||||||
|
|
||||||
|
// Function that Get and KeyMayExist call with no_io true or false
|
||||||
|
// Note: 'value_found' from KeyMayExist propagates here
|
||||||
|
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
||||||
|
const Slice& key, PinnableSlice* value,
|
||||||
|
bool* value_found = nullptr, ReadCallback* callback = nullptr,
|
||||||
|
bool* is_blob_index = nullptr);
|
||||||
|
|
||||||
using DB::MultiGet;
|
using DB::MultiGet;
|
||||||
virtual std::vector<Status> MultiGet(
|
virtual std::vector<Status> MultiGet(
|
||||||
const ReadOptions& options,
|
const ReadOptions& options,
|
||||||
|
@ -295,7 +303,8 @@ class DBImpl : public DB {
|
||||||
// TODO(andrewkr): this API need to be aware of range deletion operations
|
// TODO(andrewkr): this API need to be aware of range deletion operations
|
||||||
Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
|
||||||
bool cache_only, SequenceNumber* seq,
|
bool cache_only, SequenceNumber* seq,
|
||||||
bool* found_record_for_key);
|
bool* found_record_for_key,
|
||||||
|
bool* is_blob_index = nullptr);
|
||||||
|
|
||||||
using DB::IngestExternalFile;
|
using DB::IngestExternalFile;
|
||||||
virtual Status IngestExternalFile(
|
virtual Status IngestExternalFile(
|
||||||
|
@ -1272,13 +1281,6 @@ class DBImpl : public DB {
|
||||||
|
|
||||||
#endif // ROCKSDB_LITE
|
#endif // ROCKSDB_LITE
|
||||||
|
|
||||||
// Function that Get and KeyMayExist call with no_io true or false
|
|
||||||
// Note: 'value_found' from KeyMayExist propagates here
|
|
||||||
Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
|
|
||||||
const Slice& key, PinnableSlice* value,
|
|
||||||
bool* value_found = nullptr, ReadCallback* callback = nullptr,
|
|
||||||
bool* is_blob_index = nullptr);
|
|
||||||
|
|
||||||
bool GetIntPropertyInternal(ColumnFamilyData* cfd,
|
bool GetIntPropertyInternal(ColumnFamilyData* cfd,
|
||||||
const DBPropertyInfo& property_info,
|
const DBPropertyInfo& property_info,
|
||||||
bool is_locked, uint64_t* value);
|
bool is_locked, uint64_t* value);
|
||||||
|
|
|
@ -109,14 +109,13 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
|
||||||
seq, read_opts, callback, is_blob_index);
|
seq, read_opts, callback, is_blob_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MemTableListVersion::GetFromHistory(const LookupKey& key,
|
bool MemTableListVersion::GetFromHistory(
|
||||||
std::string* value, Status* s,
|
const LookupKey& key, std::string* value, Status* s,
|
||||||
MergeContext* merge_context,
|
MergeContext* merge_context, RangeDelAggregator* range_del_agg,
|
||||||
RangeDelAggregator* range_del_agg,
|
SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) {
|
||||||
SequenceNumber* seq,
|
|
||||||
const ReadOptions& read_opts) {
|
|
||||||
return GetFromList(&memlist_history_, key, value, s, merge_context,
|
return GetFromList(&memlist_history_, key, value, s, merge_context,
|
||||||
range_del_agg, seq, read_opts);
|
range_del_agg, seq, read_opts, nullptr /*read_callback*/,
|
||||||
|
is_blob_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool MemTableListVersion::GetFromList(
|
bool MemTableListVersion::GetFromList(
|
||||||
|
|
|
@ -73,14 +73,16 @@ class MemTableListVersion {
|
||||||
bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
|
bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
|
||||||
MergeContext* merge_context,
|
MergeContext* merge_context,
|
||||||
RangeDelAggregator* range_del_agg, SequenceNumber* seq,
|
RangeDelAggregator* range_del_agg, SequenceNumber* seq,
|
||||||
const ReadOptions& read_opts);
|
const ReadOptions& read_opts,
|
||||||
|
bool* is_blob_index = nullptr);
|
||||||
bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
|
bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
|
||||||
MergeContext* merge_context,
|
MergeContext* merge_context,
|
||||||
RangeDelAggregator* range_del_agg,
|
RangeDelAggregator* range_del_agg,
|
||||||
const ReadOptions& read_opts) {
|
const ReadOptions& read_opts,
|
||||||
|
bool* is_blob_index = nullptr) {
|
||||||
SequenceNumber seq;
|
SequenceNumber seq;
|
||||||
return GetFromHistory(key, value, s, merge_context, range_del_agg, &seq,
|
return GetFromHistory(key, value, s, merge_context, range_del_agg, &seq,
|
||||||
read_opts);
|
read_opts, is_blob_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
|
Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
|
||||||
|
|
|
@ -32,8 +32,7 @@
|
||||||
#include "util/random.h"
|
#include "util/random.h"
|
||||||
#include "util/sync_point.h"
|
#include "util/sync_point.h"
|
||||||
#include "util/timer_queue.h"
|
#include "util/timer_queue.h"
|
||||||
#include "utilities/transactions/optimistic_transaction.h"
|
#include "utilities/blob_db/blob_db_iterator.h"
|
||||||
#include "utilities/transactions/optimistic_transaction_db_impl.h"
|
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
int kBlockBasedTableVersionFormat = 2;
|
int kBlockBasedTableVersionFormat = 2;
|
||||||
|
@ -78,7 +77,7 @@ class BlobHandle {
|
||||||
|
|
||||||
void EncodeTo(std::string* dst) const;
|
void EncodeTo(std::string* dst) const;
|
||||||
|
|
||||||
Status DecodeFrom(Slice* input);
|
Status DecodeFrom(const Slice& input);
|
||||||
|
|
||||||
void clear();
|
void clear();
|
||||||
|
|
||||||
|
@ -109,10 +108,12 @@ void BlobHandle::clear() {
|
||||||
compression_ = kNoCompression;
|
compression_ = kNoCompression;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status BlobHandle::DecodeFrom(Slice* input) {
|
Status BlobHandle::DecodeFrom(const Slice& input) {
|
||||||
if (GetVarint64(input, &file_number_) && GetVarint64(input, &offset_) &&
|
Slice s(input);
|
||||||
GetVarint64(input, &size_)) {
|
Slice* p = &s;
|
||||||
compression_ = static_cast<CompressionType>(input->data()[0]);
|
if (GetVarint64(p, &file_number_) && GetVarint64(p, &offset_) &&
|
||||||
|
GetVarint64(p, &size_)) {
|
||||||
|
compression_ = static_cast<CompressionType>(p->data()[0]);
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
} else {
|
} else {
|
||||||
clear();
|
clear();
|
||||||
|
@ -149,8 +150,7 @@ void EvictAllVersionsCompactionListener::InternalListener::OnCompaction(
|
||||||
value_type ==
|
value_type ==
|
||||||
CompactionEventListener::CompactionListenerValueType::kValue) {
|
CompactionEventListener::CompactionListenerValueType::kValue) {
|
||||||
BlobHandle handle;
|
BlobHandle handle;
|
||||||
Slice lsmval(existing_value);
|
Status s = handle.DecodeFrom(existing_value);
|
||||||
Status s = handle.DecodeFrom(&lsmval);
|
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
if (impl_->debug_level_ >= 3)
|
if (impl_->debug_level_ >= 3)
|
||||||
ROCKS_LOG_INFO(impl_->db_options_.info_log,
|
ROCKS_LOG_INFO(impl_->db_options_.info_log,
|
||||||
|
@ -211,8 +211,6 @@ Status BlobDBImpl::LinkToBaseDB(DB* db) {
|
||||||
|
|
||||||
env_ = db_->GetEnv();
|
env_ = db_->GetEnv();
|
||||||
|
|
||||||
opt_db_.reset(new OptimisticTransactionDBImpl(db, false));
|
|
||||||
|
|
||||||
Status s = env_->CreateDirIfMissing(blob_dir_);
|
Status s = env_->CreateDirIfMissing(blob_dir_);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
ROCKS_LOG_WARN(db_options_.info_log,
|
ROCKS_LOG_WARN(db_options_.info_log,
|
||||||
|
@ -237,7 +235,6 @@ BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; }
|
||||||
BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options)
|
BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options)
|
||||||
: BlobDB(db),
|
: BlobDB(db),
|
||||||
db_impl_(static_cast_with_check<DBImpl, DB>(db)),
|
db_impl_(static_cast_with_check<DBImpl, DB>(db)),
|
||||||
opt_db_(new OptimisticTransactionDBImpl(db, false)),
|
|
||||||
wo_set_(false),
|
wo_set_(false),
|
||||||
bdb_options_(blob_db_options),
|
bdb_options_(blob_db_options),
|
||||||
db_options_(db->GetOptions()),
|
db_options_(db->GetOptions()),
|
||||||
|
@ -827,8 +824,8 @@ Status BlobDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
|
||||||
extendTTL(&(bfile->ttl_range_), expiration);
|
extendTTL(&(bfile->ttl_range_), expiration);
|
||||||
}
|
}
|
||||||
|
|
||||||
return WriteBatchInternal::Put(&updates_blob_, column_family_id, key,
|
return WriteBatchInternal::PutBlobIndex(&updates_blob_, column_family_id,
|
||||||
index_entry);
|
key, index_entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual Status DeleteCF(uint32_t column_family_id,
|
virtual Status DeleteCF(uint32_t column_family_id,
|
||||||
|
@ -997,18 +994,6 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
|
||||||
std::string headerbuf;
|
std::string headerbuf;
|
||||||
Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1);
|
Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1);
|
||||||
|
|
||||||
// this is another more safer way to do it, where you keep the writeLock
|
|
||||||
// for the entire write path. this will increase latency and reduce
|
|
||||||
// throughput
|
|
||||||
// WriteLock lockbfile_w(&bfile->mutex_);
|
|
||||||
// std::shared_ptr<Writer> writer =
|
|
||||||
// CheckOrCreateWriterLocked(bfile);
|
|
||||||
|
|
||||||
if (debug_level_ >= 3)
|
|
||||||
ROCKS_LOG_DEBUG(
|
|
||||||
db_options_.info_log, ">Adding KEY FILE: %s: KEY: %s VALSZ: %d",
|
|
||||||
bfile->PathName().c_str(), key.ToString().c_str(), value.size());
|
|
||||||
|
|
||||||
std::string index_entry;
|
std::string index_entry;
|
||||||
Status s = AppendBlob(bfile, headerbuf, key, value, &index_entry);
|
Status s = AppendBlob(bfile, headerbuf, key, value, &index_entry);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
|
@ -1022,11 +1007,17 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
|
||||||
}
|
}
|
||||||
|
|
||||||
WriteBatch batch;
|
WriteBatch batch;
|
||||||
batch.Put(key, index_entry);
|
uint32_t column_family_id =
|
||||||
|
reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
|
||||||
|
s = WriteBatchInternal::PutBlobIndex(&batch, column_family_id, key,
|
||||||
|
index_entry);
|
||||||
|
|
||||||
// this goes to the base db and can be expensive
|
// this goes to the base db and can be expensive
|
||||||
|
if (s.ok()) {
|
||||||
s = db_->Write(options, &batch);
|
s = db_->Write(options, &batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s.ok()) {
|
||||||
// this is the sequence number of the write.
|
// this is the sequence number of the write.
|
||||||
SequenceNumber sn = WriteBatchInternal::Sequence(&batch);
|
SequenceNumber sn = WriteBatchInternal::Sequence(&batch);
|
||||||
bfile->ExtendSequenceRange(sn);
|
bfile->ExtendSequenceRange(sn);
|
||||||
|
@ -1035,7 +1026,6 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
|
||||||
extendTTL(&(bfile->ttl_range_), expiration);
|
extendTTL(&(bfile->ttl_range_), expiration);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (s.ok()) {
|
|
||||||
s = CloseBlobFileIfNeeded(bfile);
|
s = CloseBlobFileIfNeeded(bfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1112,21 +1102,16 @@ std::vector<Status> BlobDBImpl::MultiGet(
|
||||||
// fetch and index entry and reading from the file.
|
// fetch and index entry and reading from the file.
|
||||||
ReadOptions ro(read_options);
|
ReadOptions ro(read_options);
|
||||||
bool snapshot_created = SetSnapshotIfNeeded(&ro);
|
bool snapshot_created = SetSnapshotIfNeeded(&ro);
|
||||||
std::vector<std::string> values_lsm;
|
|
||||||
values_lsm.resize(keys.size());
|
|
||||||
auto statuses = db_->MultiGet(ro, keys, &values_lsm);
|
|
||||||
TEST_SYNC_POINT("BlobDBImpl::MultiGet:AfterIndexEntryGet:1");
|
|
||||||
TEST_SYNC_POINT("BlobDBImpl::MultiGet:AfterIndexEntryGet:2");
|
|
||||||
|
|
||||||
values->resize(keys.size());
|
std::vector<Status> statuses;
|
||||||
assert(statuses.size() == keys.size());
|
statuses.reserve(keys.size());
|
||||||
assert(values_lsm.size() == keys.size());
|
values->clear();
|
||||||
for (size_t i = 0; i < keys.size(); ++i) {
|
values->reserve(keys.size());
|
||||||
if (!statuses[i].ok()) {
|
PinnableSlice value;
|
||||||
continue;
|
for (size_t i = 0; i < keys.size(); i++) {
|
||||||
}
|
statuses.push_back(Get(ro, DefaultColumnFamily(), keys[i], &value));
|
||||||
Status s = CommonGet(keys[i], values_lsm[i], &((*values)[i]));
|
values->push_back(value.ToString());
|
||||||
statuses[i] = s;
|
value.Reset();
|
||||||
}
|
}
|
||||||
if (snapshot_created) {
|
if (snapshot_created) {
|
||||||
db_->ReleaseSnapshot(ro.snapshot);
|
db_->ReleaseSnapshot(ro.snapshot);
|
||||||
|
@ -1143,12 +1128,11 @@ bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
Status BlobDBImpl::CommonGet(const Slice& key, const std::string& index_entry,
|
Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||||
std::string* value) {
|
PinnableSlice* value) {
|
||||||
assert(value != nullptr);
|
assert(value != nullptr);
|
||||||
Slice index_entry_slice(index_entry);
|
|
||||||
BlobHandle handle;
|
BlobHandle handle;
|
||||||
Status s = handle.DecodeFrom(&index_entry_slice);
|
Status s = handle.DecodeFrom(index_entry);
|
||||||
if (!s.ok()) return s;
|
if (!s.ok()) return s;
|
||||||
|
|
||||||
// offset has to have certain min, as we will read CRC
|
// offset has to have certain min, as we will read CRC
|
||||||
|
@ -1179,9 +1163,8 @@ Status BlobDBImpl::CommonGet(const Slice& key, const std::string& index_entry,
|
||||||
bfile = hitr->second;
|
bfile = hitr->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 0 - size
|
if (handle.size() == 0 && value != nullptr) {
|
||||||
if (!handle.size() && value != nullptr) {
|
value->PinSelf("");
|
||||||
value->clear();
|
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1189,7 +1172,7 @@ Status BlobDBImpl::CommonGet(const Slice& key, const std::string& index_entry,
|
||||||
std::shared_ptr<RandomAccessFileReader> reader =
|
std::shared_ptr<RandomAccessFileReader> reader =
|
||||||
GetOrOpenRandomAccessReader(bfile, env_, env_options_);
|
GetOrOpenRandomAccessReader(bfile, env_, env_options_);
|
||||||
|
|
||||||
std::string* valueptr = value;
|
std::string* valueptr = value->GetSelf();
|
||||||
std::string value_c;
|
std::string value_c;
|
||||||
if (bdb_options_.compression != kNoCompression) {
|
if (bdb_options_.compression != kNoCompression) {
|
||||||
valueptr = &value_c;
|
valueptr = &value_c;
|
||||||
|
@ -1251,9 +1234,11 @@ Status BlobDBImpl::CommonGet(const Slice& key, const std::string& index_entry,
|
||||||
blob_value.data(), blob_value.size(), &contents,
|
blob_value.data(), blob_value.size(), &contents,
|
||||||
kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression,
|
kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression,
|
||||||
*(cfh->cfd()->ioptions()));
|
*(cfh->cfd()->ioptions()));
|
||||||
*value = contents.data.ToString();
|
*(value->GetSelf()) = contents.data.ToString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
value->PinSelf();
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1271,13 +1256,16 @@ Status BlobDBImpl::Get(const ReadOptions& read_options,
|
||||||
bool snapshot_created = SetSnapshotIfNeeded(&ro);
|
bool snapshot_created = SetSnapshotIfNeeded(&ro);
|
||||||
|
|
||||||
Status s;
|
Status s;
|
||||||
std::string index_entry;
|
bool is_blob_index = false;
|
||||||
s = db_->Get(ro, key, &index_entry);
|
s = db_impl_->GetImpl(ro, column_family, key, value, nullptr /*value_found*/,
|
||||||
|
nullptr /*read_callback*/, &is_blob_index);
|
||||||
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1");
|
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1");
|
||||||
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2");
|
TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2");
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
s = CommonGet(key, index_entry, value->GetSelf());
|
if (is_blob_index) {
|
||||||
value->PinSelf();
|
PinnableSlice index_entry = std::move(*value);
|
||||||
|
s = GetBlobValue(key, index_entry, value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (snapshot_created) {
|
if (snapshot_created) {
|
||||||
db_->ReleaseSnapshot(ro.snapshot);
|
db_->ReleaseSnapshot(ro.snapshot);
|
||||||
|
@ -1285,15 +1273,6 @@ Status BlobDBImpl::Get(const ReadOptions& read_options,
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
Slice BlobDBIterator::value() const {
|
|
||||||
TEST_SYNC_POINT("BlobDBIterator::value:BeforeGetBlob:1");
|
|
||||||
TEST_SYNC_POINT("BlobDBIterator::value:BeforeGetBlob:2");
|
|
||||||
Slice index_entry = iter_->value();
|
|
||||||
Status s =
|
|
||||||
db_impl_->CommonGet(iter_->key(), index_entry.ToString(false), &vpart_);
|
|
||||||
return Slice(vpart_);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::pair<bool, int64_t> BlobDBImpl::SanityCheck(bool aborted) {
|
std::pair<bool, int64_t> BlobDBImpl::SanityCheck(bool aborted) {
|
||||||
if (aborted) return std::make_pair(false, -1);
|
if (aborted) return std::make_pair(false, -1);
|
||||||
|
|
||||||
|
@ -1411,14 +1390,13 @@ bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& lsmValue) {
|
bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& index_entry) {
|
||||||
Slice val(lsmValue);
|
|
||||||
BlobHandle handle;
|
BlobHandle handle;
|
||||||
Status s = handle.DecodeFrom(&val);
|
Status s = handle.DecodeFrom(index_entry);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
ROCKS_LOG_INFO(db_options_.info_log,
|
ROCKS_LOG_INFO(db_options_.info_log,
|
||||||
"Could not parse lsm val in MarkBlobDeleted %s",
|
"Could not parse lsm val in MarkBlobDeleted %s",
|
||||||
lsmValue.ToString().c_str());
|
index_entry.ToString().c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool succ = FindFileAndEvictABlob(handle.filenumber(), key.size(),
|
bool succ = FindFileAndEvictABlob(handle.filenumber(), key.size(),
|
||||||
|
@ -1618,7 +1596,52 @@ std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
|
||||||
return std::make_pair(true, -1);
|
return std::make_pair(true, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
// Write callback for garbage collection to check if key has been updated
|
||||||
|
// since last read. Similar to how OptimisticTransaction works. See inline
|
||||||
|
// comment in GCFileAndUpdateLSM().
|
||||||
|
// Write callback used by blob garbage collection to detect whether a key was
// deleted or overwritten between the time GC read it and the time GC's own
// write is applied.
//
// Callback() looks up the latest sequence number recorded for the key and
// returns Status::Busy when the key is gone or when that sequence number is
// larger than the upper bound supplied at construction, so the guarded write
// is rejected instead of clobbering a newer version.
class BlobDBImpl::GarbageCollectionWriteCallback : public WriteCallback {
 public:
  // cfd:         column family that holds the key.
  // key:         key to re-check at write time. It is stored as a Slice;
  //              NOTE(review): presumably a non-owning view, so the key's
  //              storage must outlive this callback -- confirm.
  // upper_bound: largest sequence number the key may have for the write to
  //              be allowed to proceed.
  GarbageCollectionWriteCallback(ColumnFamilyData* cfd, const Slice& key,
                                 SequenceNumber upper_bound)
      : cfd_(cfd), key_(key), upper_bound_(upper_bound) {}

  // Returns:
  //   - the lookup status unchanged, if the lookup failed with anything other
  //     than NotFound;
  //   - Status::Busy("Key deleted"), if no record exists for the key;
  //   - Status::Busy("Key overwritten"), if the key's latest sequence number
  //     exceeds upper_bound_;
  //   - the (OK) lookup status otherwise.
  virtual Status Callback(DB* db) override {
    // DBImpl derives from DB, so static_cast is the correct base-to-derived
    // conversion; reinterpret_cast performs no pointer adjustment.
    // NOTE(review): assumes `db` really is a DBImpl -- confirm against the
    // caller.
    auto* db_impl = static_cast<DBImpl*>(db);
    auto* sv = db_impl->GetAndRefSuperVersion(cfd_);
    SequenceNumber latest_seq = 0;
    bool found_record_for_key = false;
    bool is_blob_index = false;
    Status s = db_impl->GetLatestSequenceForKey(
        sv, key_, false /*cache_only*/, &latest_seq, &found_record_for_key,
        &is_blob_index);
    // The super version is only needed for the lookup above; hand it back
    // before inspecting the result so every return path below is covered.
    db_impl->ReturnAndCleanupSuperVersion(cfd_, sv);
    if (!s.ok() && !s.IsNotFound()) {
      // Error.
      // Busy is reserved for the "deleted"/"overwritten" outcomes produced
      // below; the lookup itself is not expected to report it.
      assert(!s.IsBusy());
      return s;
    }
    if (s.IsNotFound()) {
      assert(!found_record_for_key);
      return Status::Busy("Key deleted");
    }
    assert(found_record_for_key);
    assert(is_blob_index);
    if (latest_seq > upper_bound_) {
      return Status::Busy("Key overwritten");
    }
    return s;
  }

  // Always returns false, i.e. opts out of write batching.
  virtual bool AllowWriteBatching() override { return false; }

 private:
  // Column family the key belongs to.
  ColumnFamilyData* cfd_;
  // Key to check
  Slice key_;
  // Upper bound of sequence number to proceed.
  SequenceNumber upper_bound_;
};
|
||||||
|
|
||||||
// iterate over the blobs sequentially and check if the blob sequence number
|
// iterate over the blobs sequentially and check if the blob sequence number
|
||||||
// is the latest. If it is the latest, preserve it, otherwise delete it
|
// is the latest. If it is the latest, preserve it, otherwise delete it
|
||||||
// if it is TTL based, and the TTL has expired, then
|
// if it is TTL based, and the TTL has expired, then
|
||||||
|
@ -1631,7 +1654,6 @@ std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
|
||||||
//
|
//
|
||||||
// if it is not TTL based, then we can blow the key if the key has been
|
// if it is not TTL based, then we can blow the key if the key has been
|
||||||
// DELETED in the LSM
|
// DELETED in the LSM
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||||
GCStats* gc_stats) {
|
GCStats* gc_stats) {
|
||||||
uint64_t now = EpochNow();
|
uint64_t now = EpochNow();
|
||||||
|
@ -1656,14 +1678,14 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||||
|
|
||||||
bool first_gc = bfptr->gc_once_after_open_;
|
bool first_gc = bfptr->gc_once_after_open_;
|
||||||
|
|
||||||
ColumnFamilyHandle* cfh = bfptr->GetColumnFamily(db_);
|
auto* cfh = bfptr->GetColumnFamily(db_);
|
||||||
|
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
|
||||||
|
auto column_family_id = cfd->GetID();
|
||||||
bool has_ttl = header.HasTTL();
|
bool has_ttl = header.HasTTL();
|
||||||
|
|
||||||
// this reads the key but skips the blob
|
// this reads the key but skips the blob
|
||||||
Reader::ReadLevel shallow = Reader::kReadHeaderKey;
|
Reader::ReadLevel shallow = Reader::kReadHeaderKey;
|
||||||
|
|
||||||
assert(opt_db_);
|
|
||||||
|
|
||||||
bool no_relocation_ttl = (has_ttl && now >= bfptr->GetTTLRange().second);
|
bool no_relocation_ttl = (has_ttl && now >= bfptr->GetTTLRange().second);
|
||||||
|
|
||||||
bool no_relocation_lsmdel = false;
|
bool no_relocation_lsmdel = false;
|
||||||
|
@ -1683,23 +1705,11 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||||
BlobLogRecord record;
|
BlobLogRecord record;
|
||||||
std::shared_ptr<BlobFile> newfile;
|
std::shared_ptr<BlobFile> newfile;
|
||||||
std::shared_ptr<Writer> new_writer;
|
std::shared_ptr<Writer> new_writer;
|
||||||
Transaction* transaction = nullptr;
|
|
||||||
uint64_t blob_offset = 0;
|
uint64_t blob_offset = 0;
|
||||||
bool retry = false;
|
|
||||||
|
|
||||||
static const WriteOptions kGarbageCollectionWriteOptions = []() {
|
|
||||||
WriteOptions write_options;
|
|
||||||
// It is ok to ignore column families that were dropped.
|
|
||||||
write_options.ignore_missing_column_families = true;
|
|
||||||
return write_options;
|
|
||||||
}();
|
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
assert(s.ok());
|
assert(s.ok());
|
||||||
if (retry) {
|
|
||||||
// Retry in case transaction fail with Status::TryAgain.
|
|
||||||
retry = false;
|
|
||||||
} else {
|
|
||||||
// Read the next blob record.
|
// Read the next blob record.
|
||||||
Status read_record_status =
|
Status read_record_status =
|
||||||
reader->ReadRecord(&record, shallow, &blob_offset);
|
reader->ReadRecord(&record, shallow, &blob_offset);
|
||||||
|
@ -1709,33 +1719,38 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
gc_stats->blob_count++;
|
gc_stats->blob_count++;
|
||||||
}
|
|
||||||
|
|
||||||
transaction =
|
// Similar to OptimisticTransaction, we obtain latest_seq from
|
||||||
opt_db_->BeginTransaction(kGarbageCollectionWriteOptions,
|
// base DB, which is guaranteed to be no smaller than the sequence of
|
||||||
OptimisticTransactionOptions(), transaction);
|
// current key. We use a WriteCallback on write to check the key sequence
|
||||||
|
// on write. If the key sequence is larger than latest_seq, we know
|
||||||
std::string index_entry;
|
// a new version has been inserted and the old blob can be discarded.
|
||||||
Status get_status = transaction->GetForUpdate(ReadOptions(), cfh,
|
//
|
||||||
record.Key(), &index_entry);
|
// We cannot use OptimisticTransaction because we need to pass
|
||||||
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetForUpdate");
|
// is_blob_index flag to GetImpl.
|
||||||
if (get_status.IsNotFound()) {
|
SequenceNumber latest_seq = GetLatestSequenceNumber();
|
||||||
// Key has been deleted. Drop the blob record.
|
bool is_blob_index = false;
|
||||||
continue;
|
PinnableSlice index_entry;
|
||||||
}
|
Status get_status = db_impl_->GetImpl(
|
||||||
if (!get_status.ok()) {
|
ReadOptions(), cfh, record.Key(), &index_entry, nullptr /*value_found*/,
|
||||||
|
nullptr /*read_callback*/, &is_blob_index);
|
||||||
|
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB");
|
||||||
|
if (!get_status.ok() && !get_status.ok()) {
|
||||||
|
// error
|
||||||
s = get_status;
|
s = get_status;
|
||||||
ROCKS_LOG_ERROR(db_options_.info_log,
|
ROCKS_LOG_ERROR(db_options_.info_log,
|
||||||
"Error while getting index entry: %s",
|
"Error while getting index entry: %s",
|
||||||
s.ToString().c_str());
|
s.ToString().c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (get_status.IsNotFound() || !is_blob_index) {
|
||||||
|
// Either the key is deleted or updated with a newer version which is
|
||||||
|
// inlined in LSM.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO(yiwu): We should have an override of GetForUpdate returning a
|
|
||||||
// PinnableSlice.
|
|
||||||
Slice index_entry_slice(index_entry);
|
|
||||||
BlobHandle handle;
|
BlobHandle handle;
|
||||||
s = handle.DecodeFrom(&index_entry_slice);
|
s = handle.DecodeFrom(index_entry);
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
ROCKS_LOG_ERROR(db_options_.info_log,
|
ROCKS_LOG_ERROR(db_options_.info_log,
|
||||||
"Error while decoding index entry: %s",
|
"Error while decoding index entry: %s",
|
||||||
|
@ -1748,21 +1763,24 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GarbageCollectionWriteCallback callback(cfd, record.Key(), latest_seq);
|
||||||
|
|
||||||
// If key has expired, remove it from base DB.
|
// If key has expired, remove it from base DB.
|
||||||
if (no_relocation_ttl || (has_ttl && now >= record.GetTTL())) {
|
if (no_relocation_ttl || (has_ttl && now >= record.GetTTL())) {
|
||||||
gc_stats->num_deletes++;
|
gc_stats->num_deletes++;
|
||||||
gc_stats->deleted_size += record.GetBlobSize();
|
gc_stats->deleted_size += record.GetBlobSize();
|
||||||
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete");
|
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete");
|
||||||
transaction->Delete(cfh, record.Key());
|
WriteBatch delete_batch;
|
||||||
Status delete_status = transaction->Commit();
|
Status delete_status = delete_batch.Delete(record.Key());
|
||||||
|
if (delete_status.ok()) {
|
||||||
|
delete_status = db_impl_->WriteWithCallback(WriteOptions(),
|
||||||
|
&delete_batch, &callback);
|
||||||
|
}
|
||||||
if (delete_status.ok()) {
|
if (delete_status.ok()) {
|
||||||
gc_stats->delete_succeeded++;
|
gc_stats->delete_succeeded++;
|
||||||
} else if (delete_status.IsBusy()) {
|
} else if (delete_status.IsBusy()) {
|
||||||
// The key is overwritten in the meanwhile. Drop the blob record.
|
// The key is overwritten in the meanwhile. Drop the blob record.
|
||||||
gc_stats->overwritten_while_delete++;
|
gc_stats->overwritten_while_delete++;
|
||||||
} else if (delete_status.IsTryAgain()) {
|
|
||||||
// Retry the transaction.
|
|
||||||
retry = true;
|
|
||||||
} else {
|
} else {
|
||||||
// We hit an error.
|
// We hit an error.
|
||||||
s = delete_status;
|
s = delete_status;
|
||||||
|
@ -1829,29 +1847,27 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
|
||||||
BlobLogRecord::kHeaderSize + record.Key().size() + record.Blob().size();
|
BlobLogRecord::kHeaderSize + record.Key().size() + record.Blob().size();
|
||||||
|
|
||||||
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate");
|
TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate");
|
||||||
transaction->Put(cfh, record.Key(), new_index_entry);
|
WriteBatch rewrite_batch;
|
||||||
Status put_status = transaction->Commit();
|
Status rewrite_status = WriteBatchInternal::PutBlobIndex(
|
||||||
if (put_status.ok()) {
|
&rewrite_batch, column_family_id, record.Key(), new_index_entry);
|
||||||
|
if (rewrite_status.ok()) {
|
||||||
|
rewrite_status = db_impl_->WriteWithCallback(WriteOptions(),
|
||||||
|
&rewrite_batch, &callback);
|
||||||
|
}
|
||||||
|
if (rewrite_status.ok()) {
|
||||||
gc_stats->relocate_succeeded++;
|
gc_stats->relocate_succeeded++;
|
||||||
} else if (put_status.IsBusy()) {
|
} else if (rewrite_status.IsBusy()) {
|
||||||
// The key is overwritten in the meanwhile. Drop the blob record.
|
// The key is overwritten in the meanwhile. Drop the blob record.
|
||||||
gc_stats->overwritten_while_relocate++;
|
gc_stats->overwritten_while_relocate++;
|
||||||
} else if (put_status.IsTryAgain()) {
|
|
||||||
// Retry the transaction.
|
|
||||||
// TODO(yiwu): On retry, we can reuse the new blob record.
|
|
||||||
retry = true;
|
|
||||||
} else {
|
} else {
|
||||||
// We hit an error.
|
// We hit an error.
|
||||||
s = put_status;
|
s = rewrite_status;
|
||||||
ROCKS_LOG_ERROR(db_options_.info_log, "Error while relocating key: %s",
|
ROCKS_LOG_ERROR(db_options_.info_log, "Error while relocating key: %s",
|
||||||
s.ToString().c_str());
|
s.ToString().c_str());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} // end of ReadRecord loop
|
} // end of ReadRecord loop
|
||||||
|
|
||||||
if (transaction != nullptr) {
|
|
||||||
delete transaction;
|
|
||||||
}
|
|
||||||
ROCKS_LOG_INFO(
|
ROCKS_LOG_INFO(
|
||||||
db_options_.info_log,
|
db_options_.info_log,
|
||||||
"%s blob file %" PRIu64
|
"%s blob file %" PRIu64
|
||||||
|
@ -2195,12 +2211,20 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
|
Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
|
||||||
|
auto* cfd =
|
||||||
|
reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->cfd();
|
||||||
// Get a snapshot to avoid blob file get deleted between we
|
// Get a snapshot to avoid blob file get deleted between we
|
||||||
// fetch and index entry and reading from the file.
|
// fetch and index entry and reading from the file.
|
||||||
ReadOptions ro(read_options);
|
ManagedSnapshot* own_snapshot = nullptr;
|
||||||
bool snapshot_created = SetSnapshotIfNeeded(&ro);
|
const Snapshot* snapshot = read_options.snapshot;
|
||||||
return new BlobDBIterator(db_->NewIterator(ro), this, snapshot_created,
|
if (snapshot == nullptr) {
|
||||||
ro.snapshot);
|
own_snapshot = new ManagedSnapshot(db_);
|
||||||
|
snapshot = own_snapshot->snapshot();
|
||||||
|
}
|
||||||
|
auto* iter = db_impl_->NewIteratorImpl(
|
||||||
|
read_options, cfd, snapshot->GetSequenceNumber(),
|
||||||
|
nullptr /*read_callback*/, true /*allow_blob*/);
|
||||||
|
return new BlobDBIterator(own_snapshot, iter, this);
|
||||||
}
|
}
|
||||||
|
|
||||||
Status DestroyBlobDB(const std::string& dbname, const Options& options,
|
Status DestroyBlobDB(const std::string& dbname, const Options& options,
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "db/db_iter.h"
|
||||||
#include "rocksdb/compaction_filter.h"
|
#include "rocksdb/compaction_filter.h"
|
||||||
#include "rocksdb/db.h"
|
#include "rocksdb/db.h"
|
||||||
#include "rocksdb/listener.h"
|
#include "rocksdb/listener.h"
|
||||||
|
@ -37,7 +38,6 @@ namespace rocksdb {
|
||||||
class DBImpl;
|
class DBImpl;
|
||||||
class ColumnFamilyHandle;
|
class ColumnFamilyHandle;
|
||||||
class ColumnFamilyData;
|
class ColumnFamilyData;
|
||||||
class OptimisticTransactionDBImpl;
|
|
||||||
struct FlushJobInfo;
|
struct FlushJobInfo;
|
||||||
|
|
||||||
namespace blob_db {
|
namespace blob_db {
|
||||||
|
@ -215,9 +215,20 @@ class BlobDBImpl : public BlobDB {
|
||||||
Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family,
|
Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family,
|
||||||
const Slice& key, PinnableSlice* value) override;
|
const Slice& key, PinnableSlice* value) override;
|
||||||
|
|
||||||
|
Status GetBlobValue(const Slice& key, const Slice& index_entry,
|
||||||
|
PinnableSlice* value);
|
||||||
|
|
||||||
using BlobDB::NewIterator;
|
using BlobDB::NewIterator;
|
||||||
virtual Iterator* NewIterator(const ReadOptions& read_options) override;
|
virtual Iterator* NewIterator(const ReadOptions& read_options) override;
|
||||||
|
|
||||||
|
using BlobDB::NewIterators;
|
||||||
|
virtual Status NewIterators(
|
||||||
|
const ReadOptions& read_options,
|
||||||
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
||||||
|
std::vector<Iterator*>* iterators) override {
|
||||||
|
return Status::NotSupported("Not implemented");
|
||||||
|
}
|
||||||
|
|
||||||
using BlobDB::MultiGet;
|
using BlobDB::MultiGet;
|
||||||
virtual std::vector<Status> MultiGet(
|
virtual std::vector<Status> MultiGet(
|
||||||
const ReadOptions& read_options,
|
const ReadOptions& read_options,
|
||||||
|
@ -269,15 +280,14 @@ class BlobDBImpl : public BlobDB {
|
||||||
#endif // !NDEBUG
|
#endif // !NDEBUG
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
class GarbageCollectionWriteCallback;
|
||||||
|
|
||||||
Status OpenPhase1();
|
Status OpenPhase1();
|
||||||
|
|
||||||
// Create a snapshot if there isn't one in read options.
|
// Create a snapshot if there isn't one in read options.
|
||||||
// Return true if a snapshot is created.
|
// Return true if a snapshot is created.
|
||||||
bool SetSnapshotIfNeeded(ReadOptions* read_options);
|
bool SetSnapshotIfNeeded(ReadOptions* read_options);
|
||||||
|
|
||||||
Status CommonGet(const Slice& key, const std::string& index_entry,
|
|
||||||
std::string* value);
|
|
||||||
|
|
||||||
Slice GetCompressedSlice(const Slice& raw,
|
Slice GetCompressedSlice(const Slice& raw,
|
||||||
std::string* compression_output) const;
|
std::string* compression_output) const;
|
||||||
|
|
||||||
|
@ -416,10 +426,6 @@ class BlobDBImpl : public BlobDB {
|
||||||
Env* env_;
|
Env* env_;
|
||||||
TTLExtractor* ttl_extractor_;
|
TTLExtractor* ttl_extractor_;
|
||||||
|
|
||||||
// Optimistic Transaction DB used during Garbage collection
|
|
||||||
// for atomicity
|
|
||||||
std::unique_ptr<OptimisticTransactionDBImpl> opt_db_;
|
|
||||||
|
|
||||||
// a boolean to capture whether write_options has been set
|
// a boolean to capture whether write_options has been set
|
||||||
std::atomic<bool> wo_set_;
|
std::atomic<bool> wo_set_;
|
||||||
WriteOptions write_options_;
|
WriteOptions write_options_;
|
||||||
|
@ -527,55 +533,6 @@ class BlobDBImpl : public BlobDB {
|
||||||
uint32_t debug_level_;
|
uint32_t debug_level_;
|
||||||
};
|
};
|
||||||
|
|
||||||
class BlobDBIterator : public Iterator {
|
|
||||||
public:
|
|
||||||
explicit BlobDBIterator(Iterator* iter, BlobDBImpl* impl, bool own_snapshot,
|
|
||||||
const Snapshot* snapshot)
|
|
||||||
: iter_(iter),
|
|
||||||
db_impl_(impl),
|
|
||||||
own_snapshot_(own_snapshot),
|
|
||||||
snapshot_(snapshot) {
|
|
||||||
assert(iter != nullptr);
|
|
||||||
assert(snapshot != nullptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
~BlobDBIterator() {
|
|
||||||
if (own_snapshot_) {
|
|
||||||
db_impl_->ReleaseSnapshot(snapshot_);
|
|
||||||
}
|
|
||||||
delete iter_;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Valid() const override { return iter_->Valid(); }
|
|
||||||
|
|
||||||
void SeekToFirst() override { iter_->SeekToFirst(); }
|
|
||||||
|
|
||||||
void SeekToLast() override { iter_->SeekToLast(); }
|
|
||||||
|
|
||||||
void Seek(const Slice& target) override { iter_->Seek(target); }
|
|
||||||
|
|
||||||
void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); }
|
|
||||||
|
|
||||||
void Next() override { iter_->Next(); }
|
|
||||||
|
|
||||||
void Prev() override { iter_->Prev(); }
|
|
||||||
|
|
||||||
Slice key() const override { return iter_->key(); }
|
|
||||||
|
|
||||||
Slice value() const override;
|
|
||||||
|
|
||||||
Status status() const override { return iter_->status(); }
|
|
||||||
|
|
||||||
// Iterator::Refresh() not supported.
|
|
||||||
|
|
||||||
private:
|
|
||||||
Iterator* iter_;
|
|
||||||
BlobDBImpl* db_impl_;
|
|
||||||
bool own_snapshot_;
|
|
||||||
const Snapshot* snapshot_;
|
|
||||||
mutable std::string vpart_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace blob_db
|
} // namespace blob_db
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
#endif // ROCKSDB_LITE
|
#endif // ROCKSDB_LITE
|
||||||
|
|
104
utilities/blob_db/blob_db_iterator.h
Normal file
104
utilities/blob_db/blob_db_iterator.h
Normal file
|
@ -0,0 +1,104 @@
|
||||||
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under both the GPLv2 (found in the
|
||||||
|
// COPYING file in the root directory) and Apache 2.0 License
|
||||||
|
// (found in the LICENSE.Apache file in the root directory).
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#ifndef ROCKSDB_LITE
|
||||||
|
|
||||||
|
#include "rocksdb/iterator.h"
|
||||||
|
#include "utilities/blob_db/blob_db_impl.h"
|
||||||
|
|
||||||
|
namespace rocksdb {
|
||||||
|
namespace blob_db {
|
||||||
|
|
||||||
|
using rocksdb::ManagedSnapshot;
|
||||||
|
|
||||||
|
class BlobDBIterator : public Iterator {
|
||||||
|
public:
|
||||||
|
BlobDBIterator(ManagedSnapshot* snapshot, ArenaWrappedDBIter* iter,
|
||||||
|
BlobDBImpl* blob_db)
|
||||||
|
: snapshot_(snapshot), iter_(iter), blob_db_(blob_db) {}
|
||||||
|
|
||||||
|
virtual ~BlobDBIterator() = default;
|
||||||
|
|
||||||
|
bool Valid() const override {
|
||||||
|
if (!iter_->Valid()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return status_.ok();
|
||||||
|
}
|
||||||
|
|
||||||
|
Status status() const override {
|
||||||
|
if (!iter_->status().ok()) {
|
||||||
|
return iter_->status();
|
||||||
|
}
|
||||||
|
return status_;
|
||||||
|
}
|
||||||
|
|
||||||
|
void SeekToFirst() override {
|
||||||
|
iter_->SeekToFirst();
|
||||||
|
UpdateBlobValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
void SeekToLast() override {
|
||||||
|
iter_->SeekToLast();
|
||||||
|
UpdateBlobValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Seek(const Slice& target) override {
|
||||||
|
iter_->Seek(target);
|
||||||
|
UpdateBlobValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
void SeekForPrev(const Slice& target) override {
|
||||||
|
iter_->SeekForPrev(target);
|
||||||
|
UpdateBlobValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Next() override {
|
||||||
|
assert(Valid());
|
||||||
|
iter_->Next();
|
||||||
|
UpdateBlobValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
void Prev() override {
|
||||||
|
assert(Valid());
|
||||||
|
iter_->Prev();
|
||||||
|
UpdateBlobValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
Slice key() const override {
|
||||||
|
assert(Valid());
|
||||||
|
return iter_->key();
|
||||||
|
}
|
||||||
|
|
||||||
|
Slice value() const override {
|
||||||
|
assert(Valid());
|
||||||
|
if (!iter_->IsBlob()) {
|
||||||
|
return iter_->value();
|
||||||
|
}
|
||||||
|
return value_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterator::Refresh() not supported.
|
||||||
|
|
||||||
|
private:
|
||||||
|
void UpdateBlobValue() {
|
||||||
|
TEST_SYNC_POINT("BlobDBIterator::UpdateBlobValue:Start:1");
|
||||||
|
TEST_SYNC_POINT("BlobDBIterator::UpdateBlobValue:Start:2");
|
||||||
|
value_.Reset();
|
||||||
|
if (iter_->Valid() && iter_->IsBlob()) {
|
||||||
|
status_ = blob_db_->GetBlobValue(iter_->key(), iter_->value(), &value_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<ManagedSnapshot> snapshot_;
|
||||||
|
std::unique_ptr<ArenaWrappedDBIter> iter_;
|
||||||
|
BlobDBImpl* blob_db_;
|
||||||
|
Status status_;
|
||||||
|
PinnableSlice value_;
|
||||||
|
};
|
||||||
|
} // namespace blob_db
|
||||||
|
} // namespace rocksdb
|
||||||
|
#endif // !ROCKSDB_LITE
|
|
@ -88,9 +88,14 @@ class BlobDBTest : public testing::Test {
|
||||||
|
|
||||||
void PutRandom(const std::string &key, Random *rnd,
|
void PutRandom(const std::string &key, Random *rnd,
|
||||||
std::map<std::string, std::string> *data = nullptr) {
|
std::map<std::string, std::string> *data = nullptr) {
|
||||||
|
PutRandom(blob_db_, key, rnd, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
void PutRandom(DB *db, const std::string &key, Random *rnd,
|
||||||
|
std::map<std::string, std::string> *data = nullptr) {
|
||||||
int len = rnd->Next() % kMaxBlobSize + 1;
|
int len = rnd->Next() % kMaxBlobSize + 1;
|
||||||
std::string value = test::RandomHumanReadableString(rnd, len);
|
std::string value = test::RandomHumanReadableString(rnd, len);
|
||||||
ASSERT_OK(blob_db_->Put(WriteOptions(), Slice(key), Slice(value)));
|
ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
|
||||||
if (data != nullptr) {
|
if (data != nullptr) {
|
||||||
(*data)[key] = value;
|
(*data)[key] = value;
|
||||||
}
|
}
|
||||||
|
@ -116,9 +121,12 @@ class BlobDBTest : public testing::Test {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify blob db contain expected data and nothing more.
|
// Verify blob db contain expected data and nothing more.
|
||||||
// TODO(yiwu): Verify blob files are consistent with data in LSM.
|
|
||||||
void VerifyDB(const std::map<std::string, std::string> &data) {
|
void VerifyDB(const std::map<std::string, std::string> &data) {
|
||||||
Iterator *iter = blob_db_->NewIterator(ReadOptions());
|
VerifyDB(blob_db_, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
void VerifyDB(DB *db, const std::map<std::string, std::string> &data) {
|
||||||
|
Iterator *iter = db->NewIterator(ReadOptions());
|
||||||
iter->SeekToFirst();
|
iter->SeekToFirst();
|
||||||
for (auto &p : data) {
|
for (auto &p : data) {
|
||||||
ASSERT_TRUE(iter->Valid());
|
ASSERT_TRUE(iter->Valid());
|
||||||
|
@ -593,7 +601,7 @@ TEST_F(BlobDBTest, GCRelocateKeyWhileOverwriting) {
|
||||||
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(blob_files[0]));
|
ASSERT_OK(blob_db_impl->TEST_CloseBlobFile(blob_files[0]));
|
||||||
|
|
||||||
SyncPoint::GetInstance()->LoadDependency(
|
SyncPoint::GetInstance()->LoadDependency(
|
||||||
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetForUpdate",
|
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB",
|
||||||
"BlobDBImpl::PutUntil:Start"},
|
"BlobDBImpl::PutUntil:Start"},
|
||||||
{"BlobDBImpl::PutUntil:Finish",
|
{"BlobDBImpl::PutUntil:Finish",
|
||||||
"BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate"}});
|
"BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate"}});
|
||||||
|
@ -630,7 +638,7 @@ TEST_F(BlobDBTest, GCExpiredKeyWhileOverwriting) {
|
||||||
mock_env_->set_now_micros(300 * 1000000);
|
mock_env_->set_now_micros(300 * 1000000);
|
||||||
|
|
||||||
SyncPoint::GetInstance()->LoadDependency(
|
SyncPoint::GetInstance()->LoadDependency(
|
||||||
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetForUpdate",
|
{{"BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB",
|
||||||
"BlobDBImpl::PutUntil:Start"},
|
"BlobDBImpl::PutUntil:Start"},
|
||||||
{"BlobDBImpl::PutUntil:Finish",
|
{"BlobDBImpl::PutUntil:Finish",
|
||||||
"BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete"}});
|
"BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete"}});
|
||||||
|
@ -687,7 +695,7 @@ TEST_F(BlobDBTest, GCOldestSimpleBlobFileWhenOutOfSpace) {
|
||||||
|
|
||||||
TEST_F(BlobDBTest, ReadWhileGC) {
|
TEST_F(BlobDBTest, ReadWhileGC) {
|
||||||
// run the same test for Get(), MultiGet() and Iterator each.
|
// Run the same test for Get() and Iterator each.
|
||||||
for (int i = 0; i < 3; i++) {
|
for (int i = 0; i < 2; i++) {
|
||||||
BlobDBOptions bdb_options;
|
BlobDBOptions bdb_options;
|
||||||
bdb_options.disable_background_tasks = true;
|
bdb_options.disable_background_tasks = true;
|
||||||
Open(bdb_options);
|
Open(bdb_options);
|
||||||
|
@ -710,17 +718,10 @@ TEST_F(BlobDBTest, ReadWhileGC) {
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
SyncPoint::GetInstance()->LoadDependency(
|
SyncPoint::GetInstance()->LoadDependency(
|
||||||
{{"BlobDBImpl::MultiGet:AfterIndexEntryGet:1",
|
{{"BlobDBIterator::UpdateBlobValue:Start:1",
|
||||||
"BlobDBTest::ReadWhileGC:1"},
|
"BlobDBTest::ReadWhileGC:1"},
|
||||||
{"BlobDBTest::ReadWhileGC:2",
|
{"BlobDBTest::ReadWhileGC:2",
|
||||||
"BlobDBImpl::MultiGet:AfterIndexEntryGet:2"}});
|
"BlobDBIterator::UpdateBlobValue:Start:2"}});
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
SyncPoint::GetInstance()->LoadDependency(
|
|
||||||
{{"BlobDBIterator::value:BeforeGetBlob:1",
|
|
||||||
"BlobDBTest::ReadWhileGC:1"},
|
|
||||||
{"BlobDBTest::ReadWhileGC:2",
|
|
||||||
"BlobDBIterator::value:BeforeGetBlob:2"}});
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
SyncPoint::GetInstance()->EnableProcessing();
|
SyncPoint::GetInstance()->EnableProcessing();
|
||||||
|
@ -735,12 +736,6 @@ TEST_F(BlobDBTest, ReadWhileGC) {
|
||||||
ASSERT_EQ("bar", value);
|
ASSERT_EQ("bar", value);
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
statuses = blob_db_->MultiGet(ReadOptions(), {"foo"}, &values);
|
|
||||||
ASSERT_EQ(1, statuses.size());
|
|
||||||
ASSERT_EQ(1, values.size());
|
|
||||||
ASSERT_EQ("bar", values[0]);
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
// VerifyDB use iterator to scan the DB.
|
// VerifyDB use iterator to scan the DB.
|
||||||
VerifyDB({{"foo", "bar"}});
|
VerifyDB({{"foo", "bar"}});
|
||||||
break;
|
break;
|
||||||
|
@ -834,6 +829,58 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
|
||||||
VerifyDB(data);
|
VerifyDB(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks migration from plain rocksdb to blob db: data written by plain
// rocksdb must stay readable through blob db, while keys later written by
// blob db must be rejected (NotSupported) when read back by plain rocksdb.
TEST_F(BlobDBTest, MigrateFromPlainRocksDB) {
  constexpr size_t kNumKey = 20;
  constexpr size_t kNumIteration = 10;
  Random rnd(301);
  std::map<std::string, std::string> data;
  // written_by_blob_db[i] becomes true once "key<i>" is written via blob db.
  std::vector<bool> written_by_blob_db(kNumKey, false);

  Options options;
  options.create_if_missing = true;

  // Phase 1: populate a plain rocksdb instance.
  DB *plain_db = nullptr;
  ASSERT_OK(DB::Open(options, dbname_, &plain_db));
  for (size_t round = 0; round < kNumIteration; ++round) {
    const auto key_index = rnd.Next() % kNumKey;
    PutRandom(plain_db, "key" + ToString(key_index), &rnd, &data);
  }
  VerifyDB(plain_db, data);
  delete plain_db;
  plain_db = nullptr;

  // Phase 2: reopen as blob db, confirm the existing data is readable, then
  // overwrite a random subset of keys through blob db.
  Open();
  VerifyDB(blob_db_, data);
  for (size_t round = 0; round < kNumIteration; ++round) {
    const auto key_index = rnd.Next() % kNumKey;
    written_by_blob_db[key_index] = true;
    PutRandom(blob_db_, "key" + ToString(key_index), &rnd, &data);
  }
  VerifyDB(blob_db_, data);
  delete blob_db_;
  blob_db_ = nullptr;

  // Phase 3: reopen as plain rocksdb. Keys written by blob db must return an
  // error; untouched keys must still return their original value.
  ASSERT_OK(DB::Open(options, dbname_, &plain_db));
  std::string value;
  for (size_t i = 0; i < kNumKey; ++i) {
    const std::string key = "key" + ToString(i);
    Status s = plain_db->Get(ReadOptions(), key, &value);
    const auto expected = data.find(key);
    if (expected == data.end()) {
      ASSERT_TRUE(s.IsNotFound());
    } else if (written_by_blob_db[i]) {
      ASSERT_TRUE(s.IsNotSupported());
    } else {
      ASSERT_OK(s);
      ASSERT_EQ(expected->second, value);
    }
  }
  delete plain_db;
}
|
||||||
|
|
||||||
} // namespace blob_db
|
} // namespace blob_db
|
||||||
} // namespace rocksdb
|
} // namespace rocksdb
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue