rocksdb/table/cuckoo/cuckoo_table_reader.cc
Andrew Kryczka bf98dcf9a8 Fix kBlockCacheTier read when merge-chain base value is in a blob file (#12462)
Summary:
The original goal is to propagate failures from `GetContext::SaveValue()` -> `GetContext::GetBlobValue()` -> `BlobFetcher::FetchBlob()` up to the user. This call sequence happens when a merge chain ends with a base value in a blob file.

There's also fixes for bugs encountered along the way where non-ok statuses were ignored/overwritten, and a bit of plumbing work for functions that had no capability to return a status.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12462

Test Plan:
A repro command

```
db=/dev/shm/dbstress_db ; exp=/dev/shm/dbstress_exp ; rm -rf $db $exp ; mkdir -p $db $exp
./db_stress \
        --clear_column_family_one_in=0 \
        --test_batches_snapshots=0 \
        --write_fault_one_in=0 \
        --use_put_entity_one_in=0 \
        --prefixpercent=0 \
        --read_fault_one_in=0 \
        --readpercent=0 \
        --reopen=0 \
        --set_options_one_in=10000 \
        --delpercent=0 \
        --delrangepercent=0 \
        --open_metadata_write_fault_one_in=0 \
        --open_read_fault_one_in=0 \
        --open_write_fault_one_in=0 \
        --destroy_db_initially=0 \
        --ingest_external_file_one_in=0 \
        --iterpercent=0 \
        --nooverwritepercent=0 \
        --db=$db \
        --enable_blob_files=1 \
        --expected_values_dir=$exp \
        --max_background_compactions=20 \
        --max_bytes_for_level_base=2097152 \
        --max_key=100000 \
        --min_blob_size=0 \
        --open_files=-1 \
        --ops_per_thread=100000000 \
        --prefix_size=-1 \
        --target_file_size_base=524288 \
        --use_merge=1 \
        --value_size_mult=32 \
        --write_buffer_size=524288 \
        --writepercent=100
```

It used to fail like:

```
...
frame https://github.com/facebook/rocksdb/issues/9: 0x00007fc63903bc93 libc.so.6`__GI___assert_fail(assertion="HasDefaultColumn(columns)", file="fbcode/internal_repo_rocksdb/repo/db/wide/wide_columns_helper.h", line=33, function="static const rocksdb::Slice &rocksdb::WideColumnsHelper::GetDefaultColumn(const rocksdb::WideColumns &)") at assert.c:101:3
frame https://github.com/facebook/rocksdb/issues/10: 0x00000000006f7e92 db_stress`rocksdb::Version::Get(rocksdb::ReadOptions const&, rocksdb::LookupKey const&, rocksdb::PinnableSlice*, rocksdb::PinnableWideColumns*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>*, rocksdb::Status*, rocksdb::MergeContext*, unsigned long*, rocksdb::PinnedIteratorsManager*, bool*, bool*, unsigned long*, rocksdb::ReadCallback*, bool*, bool) [inlined] rocksdb::WideColumnsHelper::GetDefaultColumn(columns=size=0) at wide_columns_helper.h:33
frame https://github.com/facebook/rocksdb/issues/11: 0x00000000006f7e76 db_stress`rocksdb::Version::Get(this=0x00007fc5ec763000, read_options=<unavailable>, k=<unavailable>, value=0x0000000000000000, columns=0x00007fc6035fd1d8, timestamp=<unavailable>, status=0x00007fc6035fd250, merge_context=0x00007fc6035fce40, max_covering_tombstone_seq=0x00007fc6035fce90, pinned_iters_mgr=0x00007fc6035fcdf0, value_found=0x0000000000000000, key_exists=0x0000000000000000, seq=0x0000000000000000, callback=0x0000000000000000, is_blob=0x0000000000000000, do_merge=<unavailable>) at version_set.cc:2492
frame https://github.com/facebook/rocksdb/issues/12: 0x000000000051e245 db_stress`rocksdb::DBImpl::GetImpl(this=0x00007fc637a86000, read_options=0x00007fc6035fcf60, key=<unavailable>, get_impl_options=0x00007fc6035fd000) at db_impl.cc:2408
frame https://github.com/facebook/rocksdb/issues/13: 0x000000000050cec2 db_stress`rocksdb::DBImpl::GetEntity(this=0x00007fc637a86000, _read_options=<unavailable>, column_family=<unavailable>, key=0x00007fc6035fd3c8, columns=0x00007fc6035fd1d8) at db_impl.cc:2109
frame https://github.com/facebook/rocksdb/issues/14: 0x000000000074f688 db_stress`rocksdb::(anonymous namespace)::MemTableInserter::MergeCF(this=0x00007fc6035fd450, column_family_id=2, key=0x00007fc6035fd3c8, value=0x00007fc6035fd3a0) at write_batch.cc:2656
frame https://github.com/facebook/rocksdb/issues/15: 0x00000000007476fc db_stress`rocksdb::WriteBatchInternal::Iterate(wb=0x00007fc6035fe698, handler=0x00007fc6035fd450, begin=12, end=<unavailable>) at write_batch.cc:607
frame https://github.com/facebook/rocksdb/issues/16: 0x000000000074d7dd db_stress`rocksdb::WriteBatchInternal::InsertInto(rocksdb::WriteThread::WriteGroup&, unsigned long, rocksdb::ColumnFamilyMemTables*, rocksdb::FlushScheduler*, rocksdb::TrimHistoryScheduler*, bool, unsigned long, rocksdb::DB*, bool, bool, bool) [inlined] rocksdb::WriteBatch::Iterate(this=<unavailable>, handler=0x00007fc6035fd450) const at write_batch.cc:505
frame https://github.com/facebook/rocksdb/issues/17: 0x000000000074d77b db_stress`rocksdb::WriteBatchInternal::InsertInto(write_group=<unavailable>, sequence=<unavailable>, memtables=<unavailable>, flush_scheduler=<unavailable>, trim_history_scheduler=<unavailable>, ignore_missing_column_families=<unavailable>, recovery_log_number=0, db=0x00007fc637a86000, concurrent_memtable_writes=<unavailable>, seq_per_batch=false, batch_per_txn=<unavailable>) at write_batch.cc:3084
frame https://github.com/facebook/rocksdb/issues/18: 0x0000000000631d77 db_stress`rocksdb::DBImpl::PipelinedWriteImpl(this=0x00007fc637a86000, write_options=<unavailable>, my_batch=0x00007fc6035fe698, callback=0x0000000000000000, log_used=<unavailable>, log_ref=0, disable_memtable=<unavailable>, seq_used=0x0000000000000000) at db_impl_write.cc:807
frame https://github.com/facebook/rocksdb/issues/19: 0x000000000062ceeb db_stress`rocksdb::DBImpl::WriteImpl(this=<unavailable>, write_options=<unavailable>, my_batch=0x00007fc6035fe698, callback=0x0000000000000000, log_used=<unavailable>, log_ref=0, disable_memtable=<unavailable>, seq_used=0x0000000000000000, batch_cnt=0, pre_release_callback=0x0000000000000000, post_memtable_callback=0x0000000000000000) at db_impl_write.cc:312
frame https://github.com/facebook/rocksdb/issues/20: 0x000000000062c8ec db_stress`rocksdb::DBImpl::Write(this=0x00007fc637a86000, write_options=0x00007fc6035feca8, my_batch=0x00007fc6035fe698) at db_impl_write.cc:157
frame https://github.com/facebook/rocksdb/issues/21: 0x000000000062b847 db_stress`rocksdb::DB::Merge(this=0x00007fc637a86000, opt=0x00007fc6035feca8, column_family=0x00007fc6370bf140, key=0x00007fc6035fe8d8, value=0x00007fc6035fe830) at db_impl_write.cc:2544
frame https://github.com/facebook/rocksdb/issues/22: 0x000000000062b6ef db_stress`rocksdb::DBImpl::Merge(this=0x00007fc637a86000, o=<unavailable>, column_family=0x00007fc6370bf140, key=0x00007fc6035fe8d8, val=0x00007fc6035fe830) at db_impl_write.cc:72
frame https://github.com/facebook/rocksdb/issues/23: 0x00000000004d6397 db_stress`rocksdb::NonBatchedOpsStressTest::TestPut(this=0x00007fc637041000, thread=0x00007fc6370dbc00, write_opts=0x00007fc6035feca8, read_opts=0x00007fc6035fe9c8, rand_column_families=<unavailable>, rand_keys=size=1, value={P\xe9_\x03\xc6\x7f\0\0}) at no_batched_ops_stress.cc:1317
frame https://github.com/facebook/rocksdb/issues/24: 0x000000000049361d db_stress`rocksdb::StressTest::OperateDb(this=0x00007fc637041000, thread=0x00007fc6370dbc00) at db_stress_test_base.cc:1148
...
```

Reviewed By: ltamasi

Differential Revision: D55157795

Pulled By: ajkr

fbshipit-source-id: 5f7c1380ead5794c29d41680028e34b839744764
2024-03-21 12:38:53 -07:00

417 lines
14 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/cuckoo/cuckoo_table_reader.h"
#include <algorithm>
#include <limits>
#include <string>
#include <utility>
#include <vector>
#include "memory/arena.h"
#include "options/cf_options.h"
#include "rocksdb/iterator.h"
#include "rocksdb/table.h"
#include "table/cuckoo/cuckoo_table_factory.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "util/coding.h"
namespace ROCKSDB_NAMESPACE {
namespace {
const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
} // namespace
extern const uint64_t kCuckooTableMagicNumber;
CuckooTableReader::CuckooTableReader(
const ImmutableOptions& ioptions,
std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
const Comparator* comparator,
uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
: file_(std::move(file)),
is_last_level_(false),
identity_as_first_hash_(false),
use_module_hash_(false),
num_hash_func_(0),
key_length_(0),
user_key_length_(0),
value_length_(0),
bucket_length_(0),
cuckoo_block_size_(0),
cuckoo_block_bytes_minus_one_(0),
table_size_(0),
ucomp_(comparator),
get_slice_hash_(get_slice_hash) {
if (!ioptions.allow_mmap_reads) {
status_ = Status::InvalidArgument("File is not mmaped");
return;
}
{
std::unique_ptr<TableProperties> props;
// TODO: plumb Env::IOActivity, Env::IOPriority
const ReadOptions read_options;
status_ =
ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
ioptions, read_options, &props);
if (!status_.ok()) {
return;
}
table_props_ = std::move(props);
}
auto& user_props = table_props_->user_collected_properties;
auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
if (hash_funs == user_props.end()) {
status_ = Status::Corruption("Number of hash functions not found");
return;
}
num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
if (unused_key == user_props.end()) {
status_ = Status::Corruption("Empty bucket value not found");
return;
}
unused_key_ = unused_key->second;
key_length_ = static_cast<uint32_t>(table_props_->fixed_key_len);
auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
if (user_key_len == user_props.end()) {
status_ = Status::Corruption("User key length not found");
return;
}
user_key_length_ =
*reinterpret_cast<const uint32_t*>(user_key_len->second.data());
auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
if (value_length == user_props.end()) {
status_ = Status::Corruption("Value length not found");
return;
}
value_length_ =
*reinterpret_cast<const uint32_t*>(value_length->second.data());
bucket_length_ = key_length_ + value_length_;
auto hash_table_size =
user_props.find(CuckooTablePropertyNames::kHashTableSize);
if (hash_table_size == user_props.end()) {
status_ = Status::Corruption("Hash table size not found");
return;
}
table_size_ =
*reinterpret_cast<const uint64_t*>(hash_table_size->second.data());
auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
if (is_last_level == user_props.end()) {
status_ = Status::Corruption("Is last level not found");
return;
}
is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
auto identity_as_first_hash =
user_props.find(CuckooTablePropertyNames::kIdentityAsFirstHash);
if (identity_as_first_hash == user_props.end()) {
status_ = Status::Corruption("identity as first hash not found");
return;
}
identity_as_first_hash_ =
*reinterpret_cast<const bool*>(identity_as_first_hash->second.data());
auto use_module_hash =
user_props.find(CuckooTablePropertyNames::kUseModuleHash);
if (use_module_hash == user_props.end()) {
status_ = Status::Corruption("hash type is not found");
return;
}
use_module_hash_ =
*reinterpret_cast<const bool*>(use_module_hash->second.data());
auto cuckoo_block_size =
user_props.find(CuckooTablePropertyNames::kCuckooBlockSize);
if (cuckoo_block_size == user_props.end()) {
status_ = Status::Corruption("Cuckoo block size not found");
return;
}
cuckoo_block_size_ =
*reinterpret_cast<const uint32_t*>(cuckoo_block_size->second.data());
cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
// TODO: rate limit reads of whole cuckoo tables.
status_ = file_->Read(IOOptions(), 0, static_cast<size_t>(file_size),
&file_data_, nullptr, nullptr);
}
Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,
const Slice& key, GetContext* get_context,
const SliceTransform* /* prefix_extractor */,
bool /*skip_filters*/) {
assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
Slice user_key = ExtractUserKey(key);
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
uint64_t offset =
bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_,
table_size_, identity_as_first_hash_,
get_slice_hash_);
const char* bucket = &file_data_.data()[offset];
for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
++block_idx, bucket += bucket_length_) {
if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
Slice(bucket, user_key.size()))) {
return Status::OK();
}
// Here, we compare only the user key part as we support only one entry
// per user key and we don't support snapshot.
if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
Slice value(bucket + key_length_, value_length_);
if (is_last_level_) {
// Sequence number is not stored at the last level, so we will use
// kMaxSequenceNumber since it is unknown. This could cause some
// transactions to fail to lock a key due to known sequence number.
// However, it is expected for anyone to use a CuckooTable in a
// TransactionDB.
get_context->SaveValue(value, kMaxSequenceNumber);
} else {
Slice full_key(bucket, key_length_);
ParsedInternalKey found_ikey;
Status s = ParseInternalKey(full_key, &found_ikey,
false /* log_err_key */); // TODO
if (!s.ok()) {
return s;
}
bool dont_care __attribute__((__unused__));
get_context->SaveValue(found_ikey, value, &dont_care, &s);
if (!s.ok()) {
return s;
}
}
// We don't support merge operations. So, we return here.
return Status::OK();
}
}
}
return Status::OK();
}
void CuckooTableReader::Prepare(const Slice& key) {
// Prefetch the first Cuckoo Block.
Slice user_key = ExtractUserKey(key);
uint64_t addr =
reinterpret_cast<uint64_t>(file_data_.data()) +
bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
identity_as_first_hash_, nullptr);
uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
}
}
class CuckooTableIterator : public InternalIterator {
public:
explicit CuckooTableIterator(CuckooTableReader* reader);
// No copying allowed
CuckooTableIterator(const CuckooTableIterator&) = delete;
void operator=(const Iterator&) = delete;
~CuckooTableIterator() override = default;
bool Valid() const override;
void SeekToFirst() override;
void SeekToLast() override;
void Seek(const Slice& target) override;
void SeekForPrev(const Slice& target) override;
void Next() override;
void Prev() override;
Slice key() const override;
Slice value() const override;
Status status() const override { return Status::OK(); }
void InitIfNeeded();
private:
struct BucketComparator {
BucketComparator(const Slice& file_data, const Comparator* ucomp,
uint32_t bucket_len, uint32_t user_key_len,
const Slice& target = Slice())
: file_data_(file_data),
ucomp_(ucomp),
bucket_len_(bucket_len),
user_key_len_(user_key_len),
target_(target) {}
bool operator()(const uint32_t first, const uint32_t second) const {
const char* first_bucket = (first == kInvalidIndex)
? target_.data()
: &file_data_.data()[first * bucket_len_];
const char* second_bucket =
(second == kInvalidIndex) ? target_.data()
: &file_data_.data()[second * bucket_len_];
return ucomp_->Compare(Slice(first_bucket, user_key_len_),
Slice(second_bucket, user_key_len_)) < 0;
}
private:
const Slice file_data_;
const Comparator* ucomp_;
const uint32_t bucket_len_;
const uint32_t user_key_len_;
const Slice target_;
};
const BucketComparator bucket_comparator_;
void PrepareKVAtCurrIdx();
CuckooTableReader* reader_;
bool initialized_;
// Contains a map of keys to bucket_id sorted in key order.
std::vector<uint32_t> sorted_bucket_ids_;
// We assume that the number of items can be stored in uint32 (4 Billion).
uint32_t curr_key_idx_;
Slice curr_value_;
IterKey curr_key_;
};
CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
: bucket_comparator_(reader->file_data_, reader->ucomp_,
reader->bucket_length_, reader->user_key_length_),
reader_(reader),
initialized_(false),
curr_key_idx_(kInvalidIndex) {
sorted_bucket_ids_.clear();
curr_value_.clear();
curr_key_.Clear();
}
void CuckooTableIterator::InitIfNeeded() {
if (initialized_) {
return;
}
sorted_bucket_ids_.reserve(
static_cast<size_t>(reader_->GetTableProperties()->num_entries));
uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
assert(num_buckets < kInvalidIndex);
const char* bucket = reader_->file_data_.data();
for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
sorted_bucket_ids_.push_back(bucket_id);
}
bucket += reader_->bucket_length_;
}
assert(sorted_bucket_ids_.size() ==
reader_->GetTableProperties()->num_entries);
std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
bucket_comparator_);
curr_key_idx_ = kInvalidIndex;
initialized_ = true;
}
void CuckooTableIterator::SeekToFirst() {
InitIfNeeded();
curr_key_idx_ = 0;
PrepareKVAtCurrIdx();
}
void CuckooTableIterator::SeekToLast() {
InitIfNeeded();
curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
PrepareKVAtCurrIdx();
}
void CuckooTableIterator::Seek(const Slice& target) {
InitIfNeeded();
const BucketComparator seek_comparator(
reader_->file_data_, reader_->ucomp_, reader_->bucket_length_,
reader_->user_key_length_, ExtractUserKey(target));
auto seek_it =
std::lower_bound(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
kInvalidIndex, seek_comparator);
curr_key_idx_ =
static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
PrepareKVAtCurrIdx();
}
void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) {
// Not supported
assert(false);
}
bool CuckooTableIterator::Valid() const {
return curr_key_idx_ < sorted_bucket_ids_.size();
}
void CuckooTableIterator::PrepareKVAtCurrIdx() {
if (!Valid()) {
curr_value_.clear();
curr_key_.Clear();
return;
}
uint32_t id = sorted_bucket_ids_[curr_key_idx_];
const char* offset =
reader_->file_data_.data() + id * reader_->bucket_length_;
if (reader_->is_last_level_) {
// Always return internal key.
curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), 0,
kTypeValue);
} else {
curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
}
curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
}
void CuckooTableIterator::Next() {
if (!Valid()) {
curr_value_.clear();
curr_key_.Clear();
return;
}
++curr_key_idx_;
PrepareKVAtCurrIdx();
}
void CuckooTableIterator::Prev() {
if (curr_key_idx_ == 0) {
curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
}
if (!Valid()) {
curr_value_.clear();
curr_key_.Clear();
return;
}
--curr_key_idx_;
PrepareKVAtCurrIdx();
}
Slice CuckooTableIterator::key() const {
assert(Valid());
return curr_key_.GetInternalKey();
}
Slice CuckooTableIterator::value() const {
assert(Valid());
return curr_value_;
}
InternalIterator* CuckooTableReader::NewIterator(
const ReadOptions& /*read_options*/,
const SliceTransform* /* prefix_extractor */, Arena* arena,
bool /*skip_filters*/, TableReaderCaller /*caller*/,
size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
if (!status().ok()) {
return NewErrorInternalIterator<Slice>(
Status::Corruption("CuckooTableReader status is not okay."), arena);
}
CuckooTableIterator* iter;
if (arena == nullptr) {
iter = new CuckooTableIterator(this);
} else {
auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
iter = new (iter_mem) CuckooTableIterator(this);
}
return iter;
}
size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
} // namespace ROCKSDB_NAMESPACE