rocksdb/table/table_reader.h
Peter Dillinger 961468f92e Fix TSAN-reported data race with uncache_aggressiveness (#12753)
Summary:
Data race reported on
BlockBasedTableReader::Rep::uncache_aggressiveness because apparently a file can be marked obsolete through multiple table cache references in parallel. Using a relaxed atomic should resolve the race quite reasonably, especially considering this is a rare case and the racing writes should be storing the same value anyway.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12753

Test Plan: watch for TSAN crash test results

Reviewed By: ltamasi

Differential Revision: D58397473

Pulled By: pdillinger

fbshipit-source-id: 3e78b6adac4f7a7056790754bee42b3cb244f037
2024-06-11 16:53:13 -07:00

203 lines
8.7 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include "db/range_tombstone_fragmenter.h"
#if USE_COROUTINES
#include "folly/experimental/coro/Coroutine.h"
#include "folly/experimental/coro/Task.h"
#endif
#include "rocksdb/slice_transform.h"
#include "rocksdb/table_reader_caller.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/multiget_context.h"
namespace ROCKSDB_NAMESPACE {
class Iterator;
struct ParsedInternalKey;
class Slice;
class Arena;
struct ReadOptions;
struct TableProperties;
class GetContext;
class MultiGetContext;
// A Table (also referred to as SST) is a sorted map from strings to strings.
// Tables are immutable and persistent. A Table may be safely accessed from
// multiple threads without external synchronization. Table readers are used
// for reading various types of table formats supported by rocksdb including
// BlockBasedTable, PlainTable and CuckooTable format.
class TableReader {
public:
virtual ~TableReader() {}
// Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// read_options: Must outlive the returned iterator.
// arena: If not null, the arena needs to be used to allocate the Iterator.
// When destroying the iterator, the caller will not call "delete"
// but Iterator::~Iterator() directly. The destructor needs to destroy
// all the states but those allocated in arena.
// skip_filters: disables checking the bloom filters even if they exist. This
// option is effective only for block-based table format.
// compaction_readahead_size: its value will only be used if caller =
// kCompaction
virtual InternalIterator* NewIterator(
const ReadOptions& read_options, const SliceTransform* prefix_extractor,
Arena* arena, bool skip_filters, TableReaderCaller caller,
size_t compaction_readahead_size = 0,
bool allow_unprepared_value = false) = 0;
// read_options.snapshot needs to outlive this call.
virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
const ReadOptions& /*read_options*/) {
return nullptr;
}
virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
SequenceNumber /* read_seqno */, const Slice* /* timestamp */) {
return nullptr;
}
// Given a key, return an approximate byte offset in the file where
// the data for that key begins (or would begin if the key were
// present in the file). The returned value is in terms of file
// bytes, and so includes effects like compression of the underlying data.
// E.g., the approximate offset of the last key in the table will
// be close to the file length.
// TODO(peterd): Since this function is only used for approximate size
// from beginning of file, reduce code duplication by removing this
// function and letting ApproximateSize take optional start and end, so
// that absolute start and end can be specified and optimized without
// key / index work.
virtual uint64_t ApproximateOffsetOf(const ReadOptions& read_options,
const Slice& key,
TableReaderCaller caller) = 0;
// Given start and end keys, return the approximate data size in the file
// between the keys. The returned value is in terms of file bytes, and so
// includes effects like compression of the underlying data and applicable
// portions of metadata including filters and indexes. Nullptr for start or
// end (or both) indicates absolute start or end of the table.
virtual uint64_t ApproximateSize(const ReadOptions& read_options,
const Slice& start, const Slice& end,
TableReaderCaller caller) = 0;
struct Anchor {
Anchor(const Slice& _user_key, size_t _range_size)
: user_key(_user_key.ToStringView()), range_size(_range_size) {}
std::string user_key;
size_t range_size;
};
// Now try to return approximately 128 anchor keys.
// The last one tends to be the largest key.
virtual Status ApproximateKeyAnchors(const ReadOptions& /*read_options*/,
std::vector<Anchor>& /*anchors*/) {
return Status::NotSupported("ApproximateKeyAnchors() not supported.");
}
// Set up the table for Compaction. Might change some parameters with
// posix_fadvise
virtual void SetupForCompaction() = 0;
virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
// Prepare work that can be done before the real Get()
virtual void Prepare(const Slice& /*target*/) {}
// Report an approximation of how much memory has been used.
virtual size_t ApproximateMemoryUsage() const = 0;
// Calls get_context->SaveValue() repeatedly, starting with
// the entry found after a call to Seek(key), until it returns false.
// May not make such a call if filter policy says that key is not present.
//
// get_context->MarkKeyMayExist needs to be called when it is configured to be
// memory only and the key is not found in the block cache.
//
// readOptions is the options for the read
// key is the key to search for
// skip_filters: disables checking the bloom filters even if they exist. This
// option is effective only for block-based table format.
virtual Status Get(const ReadOptions& readOptions, const Slice& key,
GetContext* get_context,
const SliceTransform* prefix_extractor,
bool skip_filters = false) = 0;
// Use bloom filters in the table file, if present, to filter out keys. The
// mget_range will be updated to skip keys that get a negative result from
// the filter lookup.
virtual Status MultiGetFilter(const ReadOptions& /*readOptions*/,
const SliceTransform* /*prefix_extractor*/,
MultiGetContext::Range* /*mget_range*/) {
return Status::NotSupported();
}
virtual void MultiGet(const ReadOptions& readOptions,
const MultiGetContext::Range* mget_range,
const SliceTransform* prefix_extractor,
bool skip_filters = false) {
for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) {
*iter->s = Get(readOptions, iter->ikey, iter->get_context,
prefix_extractor, skip_filters);
}
}
#if USE_COROUTINES
virtual folly::coro::Task<void> MultiGetCoroutine(
const ReadOptions& readOptions, const MultiGetContext::Range* mget_range,
const SliceTransform* prefix_extractor, bool skip_filters = false) {
MultiGet(readOptions, mget_range, prefix_extractor, skip_filters);
co_return;
}
#endif // USE_COROUTINES
// Prefetch data corresponding to a give range of keys
// Typically this functionality is required for table implementations that
// persists the data on a non volatile storage medium like disk/SSD
virtual Status Prefetch(const ReadOptions& /* read_options */,
const Slice* begin = nullptr,
const Slice* end = nullptr) {
(void)begin;
(void)end;
// Default implementation is NOOP.
// The child class should implement functionality when applicable
return Status::OK();
}
// convert db file to a human readable form
virtual Status DumpTable(WritableFile* /*out_file*/) {
return Status::NotSupported("DumpTable() not supported");
}
// check whether there is corruption in this db file
virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
TableReaderCaller /*caller*/) {
return Status::NotSupported("VerifyChecksum() not supported");
}
// Tell the reader that the file should now be obsolete, e.g. as a hint
// to delete relevant cache entries on destruction. (It might not be safe
// to "unpin" cache entries until destruction time.) NOTE: must be thread
// safe because multiple table cache references might all mark this file as
// obsolete when they are released (the last of which destroys this reader).
virtual void MarkObsolete(uint32_t /*uncache_aggressiveness*/) {
// no-op as default
}
};
} // namespace ROCKSDB_NAMESPACE