rocksdb/db/table_cache.h

//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Thread-safe (provides internal synchronization)

#pragma once
#include <cstdint>
#include <string>
#include <vector>

#include "db/dbformat.h"
#include "db/range_del_aggregator.h"
#include "options/cf_options.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/table_reader.h"
#include "trace_replay/block_cache_tracer.h"
#include "util/coro_utils.h"

namespace ROCKSDB_NAMESPACE {

class Env;
class Arena;
struct FileDescriptor;
class GetContext;
class HistogramImpl;

// Manages caching for TableReader objects for a column family. The actual
// cache is allocated separately and passed to the constructor. TableCache
// wraps around the underlying SST file readers by providing Get(),
// MultiGet() and NewIterator() methods that hide the instantiation,
// caching and access to the TableReader. The main purpose of this is
// performance - by caching the TableReader, it avoids unnecessary file opens
// and object allocation and instantiation. One exception is compaction, where
// a new TableReader may be instantiated - see NewIterator() comments
//
// Another service provided by TableCache is managing the row cache - if the
// DB is configured with a row cache, and the lookup key is present in the row
// cache, lookup is very fast. The row cache is obtained from
// ioptions.row_cache
class TableCache {
 public:
  TableCache(const ImmutableOptions& ioptions,
             const FileOptions* storage_options, Cache* cache,
             BlockCacheTracer* const block_cache_tracer,
             const std::shared_ptr<IOTracer>& io_tracer,
             const std::string& db_session_id);
  ~TableCache();

  // Return an iterator for the specified file number (the corresponding
  // file length must be exactly "file_size" bytes).  If "table_reader_ptr"
  // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
  // underlying the returned iterator, or nullptr if no Table object underlies
  // the returned iterator.  The returned "*table_reader_ptr" object is owned
  // by the cache and should not be deleted, and is valid for as long as the
  // returned iterator is live.
  // If !options.ignore_range_deletions, and range_del_iter is non-nullptr,
  // then range_del_iter is set to a TruncatedRangeDelIterator for range
  // tombstones in the SST file corresponding to the specified file number. The
  // upper/lower bounds for the TruncatedRangeDelIterator are set to the SST
  // file's boundary.
  // @param options Must outlive the returned iterator.
  // @param range_del_agg If non-nullptr, adds range deletions to the
  //    aggregator. If an error occurs, returns it in a NewErrorInternalIterator
  // @param for_compaction If true, a new TableReader may be allocated (but
  //                       not cached), depending on the CF options
  // @param skip_filters Disables loading/accessing the filter block
  // @param level The level this table is at, -1 for "not set / don't know"
  InternalIterator* NewIterator(
      const ReadOptions& options, const FileOptions& toptions,
      const InternalKeyComparator& internal_comparator,
      const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
      const std::shared_ptr<const SliceTransform>& prefix_extractor,
      TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
      TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
      size_t max_file_size_for_l0_meta_pin,
      const InternalKey* smallest_compaction_key,
      const InternalKey* largest_compaction_key, bool allow_unprepared_value,
      TruncatedRangeDelIterator** range_del_iter = nullptr);

  // If a seek to internal key "k" in specified file finds an entry,
  // call get_context->SaveValue() repeatedly until
  // it returns false. As a side effect, it will insert the TableReader
  // into the cache and potentially evict another entry
  // @param get_context Context for get operation. The result of the lookup
  //                    can be retrieved by calling get_context->State()
  // @param file_read_hist If non-nullptr, the file reader statistics are
  //                       recorded
  // @param skip_filters Disables loading/accessing the filter block
  // @param level The level this table is at, -1 for "not set / don't know"
  Status Get(
      const ReadOptions& options,
      const InternalKeyComparator& internal_comparator,
      const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
      int level = -1, size_t max_file_size_for_l0_meta_pin = 0);

  // Return the range delete tombstone iterator of the file specified by
  // `file_meta`.
  Status GetRangeTombstoneIterator(
      const ReadOptions& options,
      const InternalKeyComparator& internal_comparator,
      const FileMetaData& file_meta,
      std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);

  // Call table reader's MultiGetFilter to use the bloom filter to filter out
  // keys. Returns Status::NotSupported() if row cache needs to be checked.
  // If the table cache is looked up to get the table reader, the cache handle
  // is returned in table_handle. This handle should be passed back to
  // MultiGet() so it can be released.
  Status MultiGetFilter(
      const ReadOptions& options,
      const InternalKeyComparator& internal_comparator,
      const FileMetaData& file_meta,
      const std::shared_ptr<const SliceTransform>& prefix_extractor,
      HistogramImpl* file_read_hist, int level,
      MultiGetContext::Range* mget_range, Cache::Handle** table_handle);

  // If a seek to internal key "k" in specified file finds an entry,
  // call get_context->SaveValue() repeatedly until
  // it returns false. As a side effect, it will insert the TableReader
  // into the cache and potentially evict another entry
  // @param mget_range Pointer to the structure describing a batch of keys to
  //                   be looked up in this table file. The result is stored
  //                   in the embedded GetContext
  // @param skip_filters Disables loading/accessing the filter block
  // @param level The level this table is at, -1 for "not set / don't know"
  DECLARE_SYNC_AND_ASYNC(
      Status, MultiGet, const ReadOptions& options,
      const InternalKeyComparator& internal_comparator,
      const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
      bool skip_range_deletions = false, int level = -1,
      Cache::Handle* table_handle = nullptr);

  // Evict any entry for the specified file number
  static void Evict(Cache* cache, uint64_t file_number);

  // Query whether specified file number is currently in cache
  static bool HasEntry(Cache* cache, uint64_t file_number);

  // Clean table handle and erase it from the table cache
  // Used in DB close, or the file is not live anymore.
  void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);

  // Find table reader
  // @param skip_filters Disables loading/accessing the filter block
  // @param level == -1 means not specified
  Status FindTable(
      const ReadOptions& ro, const FileOptions& toptions,
      const InternalKeyComparator& internal_comparator,
      const FileDescriptor& file_fd, Cache::Handle**,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
      const bool no_io = false, bool record_read_stats = true,
      HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
      int level = -1, bool prefetch_index_and_filter_in_cache = true,
      size_t max_file_size_for_l0_meta_pin = 0,
      Temperature file_temperature = Temperature::kUnknown);

  // Get TableReader from a cache handle.
  TableReader* GetTableReaderFromHandle(Cache::Handle* handle);

  // Get the table properties of a given table.
  // @no_io: indicates if we should load table to the cache if it is not present
  //         in table cache yet.
  // @returns: `properties` will be reset on success. Please note that we will
  //            return Status::Incomplete() if table is not present in cache and
  //            we set `no_io` to be true.
  Status GetTableProperties(
      const FileOptions& toptions,
      const InternalKeyComparator& internal_comparator,
      const FileDescriptor& file_meta,
      std::shared_ptr<const TableProperties>* properties,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
      bool no_io = false);

  Status ApproximateKeyAnchors(const ReadOptions& ro,
                               const InternalKeyComparator& internal_comparator,
                               const FileDescriptor& file_meta,
                               std::vector<TableReader::Anchor>& anchors);

  // Return total memory usage of the table reader of the file.
  // 0 if table reader of the file is not loaded.
  size_t GetMemoryUsageByTableReader(
      const FileOptions& toptions,
      const InternalKeyComparator& internal_comparator,
      const FileDescriptor& fd,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);

  // Returns approximated offset of a key in a file represented by fd.
  uint64_t ApproximateOffsetOf(
      const Slice& key, const FileDescriptor& fd, TableReaderCaller caller,
      const InternalKeyComparator& internal_comparator,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);

  // Returns approximated data size between start and end keys in a file
  // represented by fd (the start key must not be greater than the end key).
  uint64_t ApproximateSize(
      const Slice& start, const Slice& end, const FileDescriptor& fd,
      TableReaderCaller caller,
      const InternalKeyComparator& internal_comparator,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);

  // Release the handle from a cache
  void ReleaseHandle(Cache::Handle* handle);

  Cache* get_cache() const { return cache_; }

  // Capacity of the backing Cache that indicates infinite TableCache capacity.
  // For example when max_open_files is -1 we set the backing Cache to this.
  static const int kInfiniteCapacity = 0x400000;

  // The tables opened with this TableCache will be immortal, i.e., their
  // lifetime is as long as that of the DB.
  void SetTablesAreImmortal() {
    if (cache_->GetCapacity() >= kInfiniteCapacity) {
      immortal_tables_ = true;
    }
  }

 private:
  // Build a table reader
  Status GetTableReader(
      const ReadOptions& ro, const FileOptions& file_options,
      const InternalKeyComparator& internal_comparator,
      const FileDescriptor& fd, bool sequential_mode, bool record_read_stats,
      HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
      const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
      bool skip_filters = false, int level = -1,
      bool prefetch_index_and_filter_in_cache = true,
      size_t max_file_size_for_l0_meta_pin = 0,
      Temperature file_temperature = Temperature::kUnknown);

  // Update the max_covering_tombstone_seq in the GetContext for each key based
  // on the range deletions in the table
  void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
                                   MultiGetContext::Range& table_range);

  // Create a key prefix for looking up the row cache. The prefix is of the
  // format row_cache_id + fd_number + seq_no. Later, the user key can be
  // appended to form the full key
  void CreateRowCacheKeyPrefix(const ReadOptions& options,
                               const FileDescriptor& fd,
                               const Slice& internal_key,
                               GetContext* get_context, IterKey& row_cache_key);

  // Helper function to lookup the row cache for a key. It appends the
  // user key to row_cache_key at offset prefix_size
  bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
                       size_t prefix_size, GetContext* get_context);

  const ImmutableOptions& ioptions_;
  const FileOptions& file_options_;
  Cache* const cache_;
  std::string row_cache_id_;
  bool immortal_tables_;
  BlockCacheTracer* const block_cache_tracer_;
  Striped<port::Mutex, Slice> loader_mutex_;
  std::shared_ptr<IOTracer> io_tracer_;
  std::string db_session_id_;
};

}  // namespace ROCKSDB_NAMESPACE