rocksdb/table/format.h

//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <array>
#include <cstdint>
#include <string>

#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"
#include "memory/memory_allocator_impl.h"
#include "options/cf_options.h"
#include "port/malloc.h"
#include "port/port.h"  // noexcept
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {

class RandomAccessFile;
struct ReadOptions;

bool ShouldReportDetailedTime(Env* env, Statistics* stats);

// the length of the magic number in bytes.
constexpr uint32_t kMagicNumberLengthByte = 8;

// BlockHandle is a pointer to the extent of a file that stores a data
// block or a meta block.
class BlockHandle {
 public:
  // Creates a block handle with special values indicating "uninitialized,"
  // distinct from the "null" block handle.
  BlockHandle();
  BlockHandle(uint64_t offset, uint64_t size);

  // The offset of the block in the file.
  uint64_t offset() const { return offset_; }
  void set_offset(uint64_t _offset) { offset_ = _offset; }

  // The size of the stored block
  uint64_t size() const { return size_; }
  void set_size(uint64_t _size) { size_ = _size; }

  void EncodeTo(std::string* dst) const;
  char* EncodeTo(char* dst) const;
  Status DecodeFrom(Slice* input);
  Status DecodeSizeFrom(uint64_t offset, Slice* input);

  // Return a string that contains the copy of handle.
  std::string ToString(bool hex = true) const;

  // if the block handle's offset and size are both "0", we will view it
  // as a null block handle that points to no where.
  bool IsNull() const { return offset_ == 0 && size_ == 0; }

  static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }

  // Maximum encoding length of a BlockHandle
  static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length;

  inline bool operator==(const BlockHandle& rhs) const {
    return offset_ == rhs.offset_ && size_ == rhs.size_;
  }
  inline bool operator!=(const BlockHandle& rhs) const {
    return !(*this == rhs);
  }

 private:
  uint64_t offset_;
  uint64_t size_;

  static const BlockHandle kNullBlockHandle;
};

// Value in block-based table file index.
//
// The index entry for block n is: y -> h, [x],
// where: y is some key between the last key of block n (inclusive) and the
// first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
// x, if present, is the first key of block n (unshortened).
// This struct represents the "h, [x]" part.
struct IndexValue {
  BlockHandle handle;
  // Empty means unknown.
  Slice first_internal_key;

  IndexValue() = default;
  IndexValue(BlockHandle _handle, Slice _first_internal_key)
      : handle(_handle), first_internal_key(_first_internal_key) {}

  // have_first_key indicates whether the `first_internal_key` is used.
  // If previous_handle is not null, delta encoding is used;
  // in this case, the two handles must point to consecutive blocks:
  // handle.offset() ==
  //     previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
  void EncodeTo(std::string* dst, bool have_first_key,
                const BlockHandle* previous_handle) const;
  Status DecodeFrom(Slice* input, bool have_first_key,
                    const BlockHandle* previous_handle);

  std::string ToString(bool hex, bool have_first_key) const;
};

// Given a file's base_context_checksum and an offset of a block within that
// file, choose a 32-bit value that is as unique as possible. This value will
// be added to the standard checksum to get a checksum "with context," or can
// be subtracted to "remove" context. Returns zero (no modifier) if feature is
// disabled with base_context_checksum == 0.
inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum,
                                           uint64_t offset) {
  // To disable on base_context_checksum == 0, we could write
  // `if (base_context_checksum == 0) return 0;` but benchmarking shows
  // measurable performance penalty vs. this: compute the modifier
  // unconditionally and use an "all or nothing" bit mask to enable
  // or disable.
  uint32_t all_or_nothing = uint32_t{0} - (base_context_checksum != 0);

  // Desired properties:
  // (call this function f(b, o) where b = base and o = offset)
  // 1. Fast
  // 2. f(b1, o) == f(b2, o) iff b1 == b2
  //    (Perfectly preserve base entropy)
  // 3. f(b, o1) == f(b, o2) only if o1 == o2 or |o1-o2| >= 4 billion
  //    (Guaranteed uniqueness for nearby offsets)
  // 3. f(b, o + j * 2**32) == f(b, o + k * 2**32) only if j == k
  //    (Upper bits matter, and *aligned* misplacement fails check)
  // 4. f(b1, o) == f(b2, o + x) then preferably not
  //    f(b1, o + y) == f(b2, o + x + y)
  //    (Avoid linearly correlated matches)
  // 5. f(b, o) == 0 depends on both b and o
  //    (No predictable overlap with non-context checksums)
  uint32_t modifier =
      base_context_checksum ^ (Lower32of64(offset) + Upper32of64(offset));

  return modifier & all_or_nothing;
}

inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
  // As of format_version 2, we encode compressed block with
  // compress_format_version == 2. Before that, the version is 1.
  // DO NOT CHANGE THIS FUNCTION, it affects disk format
  return format_version >= 2 ? 2 : 1;
}

constexpr uint32_t kLatestFormatVersion = 6;

inline bool IsSupportedFormatVersion(uint32_t version) {
  return version <= kLatestFormatVersion;
}

// Same as having a unique id in footer.
inline bool FormatVersionUsesContextChecksum(uint32_t version) {
  return version >= 6;
}

inline bool FormatVersionUsesIndexHandleInFooter(uint32_t version) {
  return version < 6;
}

// Footer encapsulates the fixed information stored at the tail end of every
// SST file. In general, it should only include things that cannot go
// elsewhere under the metaindex block. For example, checksum_type is
// required for verifying metaindex block checksum (when applicable), but
// index block handle can easily go in metaindex block. See also FooterBuilder
// below.
class Footer {
 public:
  // Create empty. Populate using DecodeFrom.
  Footer() {}

  // Deserialize a footer (populate fields) from `input` and check for various
  // corruptions. `input_offset` is the offset within the target file of
  // `input` buffer, which is needed for verifying format_version >= 6 footer.
  // If enforce_table_magic_number != 0, will return corruption if table magic
  // number is not equal to enforce_table_magic_number.
  Status DecodeFrom(Slice input, uint64_t input_offset,
                    uint64_t enforce_table_magic_number = 0);

  // Table magic number identifies file as RocksDB SST file and which kind of
  // SST format is use.
  uint64_t table_magic_number() const { return table_magic_number_; }

  // A version (footer and more) within a kind of SST. (It would add more
  // unnecessary complexity to separate footer versions and
  // BBTO::format_version.)
  uint32_t format_version() const { return format_version_; }

  // See ChecksumModifierForContext()
  uint32_t base_context_checksum() const { return base_context_checksum_; }

  // Block handle for metaindex block.
  const BlockHandle& metaindex_handle() const { return metaindex_handle_; }

  // Block handle for (top-level) index block.
  // TODO? remove from this struct and only read on decode for legacy cases
  const BlockHandle& index_handle() const { return index_handle_; }

  // Checksum type used in the file, including footer for format version >= 6.
  ChecksumType checksum_type() const {
    return static_cast<ChecksumType>(checksum_type_);
  }

  // Block trailer size used by file with this footer (e.g. 5 for block-based
  // table and 0 for plain table). This is inferred from magic number so
  // not in the serialized form.
  inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }

  // Convert this object to a human readable form
  std::string ToString() const;

  // Encoded lengths of Footers. Bytes for serialized Footer will always be
  // >= kMinEncodedLength and <= kMaxEncodedLength.
  //
  // Footer version 0 (legacy) will always occupy exactly this many bytes.
  // It consists of two block handles, padding, and a magic number.
  static constexpr uint32_t kVersion0EncodedLength =
      2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte;
  static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength;

  // Footer of versions 1 and higher will always occupy exactly this many
  // bytes. It originally consisted of the checksum type, two block handles,
  // padding (to maximum handle encoding size), a format version number, and a
  // magic number.
  static constexpr uint32_t kNewVersionsEncodedLength =
      1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte;
  static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength;

  static constexpr uint64_t kNullTableMagicNumber = 0;

  static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;

 private:
  static constexpr int kInvalidChecksumType =
      (1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;

  uint64_t table_magic_number_ = kNullTableMagicNumber;
  uint32_t format_version_ = kInvalidFormatVersion;
  uint32_t base_context_checksum_ = 0;
  BlockHandle metaindex_handle_;
  BlockHandle index_handle_;
  int checksum_type_ = kInvalidChecksumType;
  uint8_t block_trailer_size_ = 0;
};

// Builder for Footer
class FooterBuilder {
 public:
  // Run builder in inputs. This is a single step with lots of parameters for
  // efficiency (based on perf testing).
  // * table_magic_number identifies file as RocksDB SST file and which kind of
  // SST format is use.
  // * format_version is a version for the footer and can also apply to other
  // aspects of the SST file (see BlockBasedTableOptions::format_version).
  // NOTE: To save complexity in the caller, when format_version == 0 and
  // there is a corresponding legacy magic number to the one specified, the
  // legacy magic number will be written for forward compatibility.
  // * footer_offset is the file offset where the footer will be written
  // (for future use).
  // * checksum_type is for formats using block checksums.
  // * index_handle is optional for some SST kinds and (for caller convenience)
  // ignored when format_version >= 6. (Must be added to metaindex in that
  // case.)
  // * unique_id must be specified if format_vesion >= 6 and SST uses block
  // checksums with context. Otherwise, auto-generated if format_vesion >= 6.
  Status Build(uint64_t table_magic_number, uint32_t format_version,
               uint64_t footer_offset, ChecksumType checksum_type,
               const BlockHandle& metaindex_handle,
               const BlockHandle& index_handle = BlockHandle::NullBlockHandle(),
               uint32_t base_context_checksum = 0);

  // After Builder, get a Slice for the serialized Footer, backed by this
  // FooterBuilder.
  const Slice& GetSlice() const {
    assert(slice_.size());
    return slice_;
  }

 private:
  Slice slice_;
  std::array<char, Footer::kMaxEncodedLength> data_;
};

// Read the footer from file
// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
// corruption if table_magic number is not equal to enforce_table_magic_number
Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
                          FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
                          uint64_t file_size, Footer* footer,
                          uint64_t enforce_table_magic_number = 0);

// Computes a checksum using the given ChecksumType. Sometimes we need to
// include one more input byte logically at the end but not part of the main
// data buffer. If data_size >= 1, then
// ComputeBuiltinChecksum(type, data, size)
// ==
// ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1])
uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
                                size_t size);
uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
                                            size_t size, char last_byte);

// Represents the contents of a block read from an SST file. Depending on how
// it's created, it may or may not own the actual block bytes. As an example,
// BlockContents objects representing data read from mmapped files only point
// into the mmapped region. Depending on context, it might be a serialized
// (potentially compressed) block, including a trailer beyond `size`, or an
// uncompressed block.
//
// Please try to use this terminology when dealing with blocks:
// * "Serialized block" - bytes that go into storage. For block-based table
// (usually the case) this includes the block trailer. Here the `size` does
// not include the trailer, but other places in code might include the trailer
// in the size.
// * "Maybe compressed block" - like a serialized block, but without the
// trailer (or no promise of including a trailer). Must be accompanied by a
// CompressionType in some other variable or field.
// * "Uncompressed block" - "payload" bytes that are either stored with no
// compression, used as input to compression function, or result of
// decompression function.
// * "Parsed block" - an in-memory form of a block in block cache, as it is
// used by the table reader. Different C++ types are used depending on the
// block type (see block_cache.h). Only trivially parsable block types
// use BlockContents as the parsed form.
//
struct BlockContents {
  // Points to block payload (without trailer)
  Slice data;
  CacheAllocationPtr allocation;

#ifndef NDEBUG
  // Whether there is a known trailer after what is pointed to by `data`.
  // See BlockBasedTable::GetCompressionType.
  bool has_trailer = false;
#endif  // NDEBUG

  BlockContents() {}

  // Does not take ownership of the underlying data bytes.
  BlockContents(const Slice& _data) : data(_data) {}

  // Takes ownership of the underlying data bytes.
  BlockContents(CacheAllocationPtr&& _data, size_t _size)
      : data(_data.get(), _size), allocation(std::move(_data)) {}

  // Takes ownership of the underlying data bytes.
  BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
      : data(_data.get(), _size) {
    allocation.reset(_data.release());
  }

  // Returns whether the object has ownership of the underlying data bytes.
  bool own_bytes() const { return allocation.get() != nullptr; }

  // The additional memory space taken by the block data.
  size_t usable_size() const {
    if (allocation.get() != nullptr) {
      auto allocator = allocation.get_deleter().allocator;
      if (allocator) {
        return allocator->UsableSize(allocation.get(), data.size());
      }
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
      return malloc_usable_size(allocation.get());
#else
      return data.size();
#endif  // ROCKSDB_MALLOC_USABLE_SIZE
    } else {
      return 0;  // no extra memory is occupied by the data
    }
  }

  size_t ApproximateMemoryUsage() const {
    return usable_size() + sizeof(*this);
  }

  BlockContents(BlockContents&& other) noexcept { *this = std::move(other); }

  BlockContents& operator=(BlockContents&& other) {
    data = std::move(other.data);
    allocation = std::move(other.allocation);
#ifndef NDEBUG
    has_trailer = other.has_trailer;
#endif  // NDEBUG
    return *this;
  }
};

// The `data` points to serialized block contents read in from file, which
// must be compressed and include a trailer beyond `size`. A new buffer is
// allocated with the given allocator (or default) and the uncompressed
// contents are returned in `out_contents`.
// format_version is as defined in include/rocksdb/table.h, which is
// used to determine compression format version.
Status UncompressSerializedBlock(const UncompressionInfo& info,
                                 const char* data, size_t size,
                                 BlockContents* out_contents,
                                 uint32_t format_version,
                                 const ImmutableOptions& ioptions,
                                 MemoryAllocator* allocator = nullptr);

// This is a variant of UncompressSerializedBlock that does not expect a
// block trailer beyond `size`. (CompressionType is taken from `info`.)
Status UncompressBlockData(const UncompressionInfo& info, const char* data,
                           size_t size, BlockContents* out_contents,
                           uint32_t format_version,
                           const ImmutableOptions& ioptions,
                           MemoryAllocator* allocator = nullptr);

// Replace db_host_id contents with the real hostname if necessary
Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id);

// Implementation details follow.  Clients should ignore,

// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
// BlockHandle. Currently we use zeros for null and use negation-of-zeros for
// uninitialized.
inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {}

inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
    : offset_(_offset), size_(_size) {}

}  // namespace ROCKSDB_NAMESPACE