mirror of https://github.com/facebook/rocksdb.git
440 lines
18 KiB
C++
440 lines
18 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#pragma once
|
|
|
|
#include <array>
|
|
#include <cstdint>
|
|
#include <string>
|
|
|
|
#include "file/file_prefetch_buffer.h"
|
|
#include "file/random_access_file_reader.h"
|
|
#include "memory/memory_allocator_impl.h"
|
|
#include "options/cf_options.h"
|
|
#include "port/malloc.h"
|
|
#include "port/port.h" // noexcept
|
|
#include "rocksdb/slice.h"
|
|
#include "rocksdb/status.h"
|
|
#include "rocksdb/table.h"
|
|
#include "util/hash.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class RandomAccessFile;
|
|
struct ReadOptions;
|
|
|
|
bool ShouldReportDetailedTime(Env* env, Statistics* stats);
|
|
|
|
// The length of the magic number in bytes.
constexpr uint32_t kMagicNumberLengthByte = 8;

// Table magic numbers. They are only declared here; the values are defined
// out of line.
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;

extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;

extern const uint64_t kCuckooTableMagicNumber;
|
|
|
|
// BlockHandle is a pointer to the extent of a file that stores a data
|
|
// block or a meta block.
|
|
class BlockHandle {
|
|
public:
|
|
// Creates a block handle with special values indicating "uninitialized,"
|
|
// distinct from the "null" block handle.
|
|
BlockHandle();
|
|
BlockHandle(uint64_t offset, uint64_t size);
|
|
|
|
// The offset of the block in the file.
|
|
uint64_t offset() const { return offset_; }
|
|
void set_offset(uint64_t _offset) { offset_ = _offset; }
|
|
|
|
// The size of the stored block
|
|
uint64_t size() const { return size_; }
|
|
void set_size(uint64_t _size) { size_ = _size; }
|
|
|
|
void EncodeTo(std::string* dst) const;
|
|
char* EncodeTo(char* dst) const;
|
|
Status DecodeFrom(Slice* input);
|
|
Status DecodeSizeFrom(uint64_t offset, Slice* input);
|
|
|
|
// Return a string that contains the copy of handle.
|
|
std::string ToString(bool hex = true) const;
|
|
|
|
// if the block handle's offset and size are both "0", we will view it
|
|
// as a null block handle that points to no where.
|
|
bool IsNull() const { return offset_ == 0 && size_ == 0; }
|
|
|
|
static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; }
|
|
|
|
// Maximum encoding length of a BlockHandle
|
|
static constexpr uint32_t kMaxEncodedLength = 2 * kMaxVarint64Length;
|
|
|
|
inline bool operator==(const BlockHandle& rhs) const {
|
|
return offset_ == rhs.offset_ && size_ == rhs.size_;
|
|
}
|
|
inline bool operator!=(const BlockHandle& rhs) const {
|
|
return !(*this == rhs);
|
|
}
|
|
|
|
private:
|
|
uint64_t offset_;
|
|
uint64_t size_;
|
|
|
|
static const BlockHandle kNullBlockHandle;
|
|
};
|
|
|
|
// Value in block-based table file index.
|
|
//
|
|
// The index entry for block n is: y -> h, [x],
|
|
// where: y is some key between the last key of block n (inclusive) and the
|
|
// first key of block n+1 (exclusive); h is BlockHandle pointing to block n;
|
|
// x, if present, is the first key of block n (unshortened).
|
|
// This struct represents the "h, [x]" part.
|
|
struct IndexValue {
|
|
BlockHandle handle;
|
|
// Empty means unknown.
|
|
Slice first_internal_key;
|
|
|
|
IndexValue() = default;
|
|
IndexValue(BlockHandle _handle, Slice _first_internal_key)
|
|
: handle(_handle), first_internal_key(_first_internal_key) {}
|
|
|
|
// have_first_key indicates whether the `first_internal_key` is used.
|
|
// If previous_handle is not null, delta encoding is used;
|
|
// in this case, the two handles must point to consecutive blocks:
|
|
// handle.offset() ==
|
|
// previous_handle->offset() + previous_handle->size() + kBlockTrailerSize
|
|
void EncodeTo(std::string* dst, bool have_first_key,
|
|
const BlockHandle* previous_handle) const;
|
|
Status DecodeFrom(Slice* input, bool have_first_key,
|
|
const BlockHandle* previous_handle);
|
|
|
|
std::string ToString(bool hex, bool have_first_key) const;
|
|
};
|
|
|
|
// Given a file's base_context_checksum and an offset of a block within that
|
|
// file, choose a 32-bit value that is as unique as possible. This value will
|
|
// be added to the standard checksum to get a checksum "with context," or can
|
|
// be subtracted to "remove" context. Returns zero (no modifier) if feature is
|
|
// disabled with base_context_checksum == 0.
|
|
inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum,
|
|
uint64_t offset) {
|
|
// To disable on base_context_checksum == 0, we could write
|
|
// `if (base_context_checksum == 0) return 0;` but benchmarking shows
|
|
// measurable performance penalty vs. this: compute the modifier
|
|
// unconditionally and use an "all or nothing" bit mask to enable
|
|
// or disable.
|
|
uint32_t all_or_nothing = uint32_t{0} - (base_context_checksum != 0);
|
|
|
|
// Desired properties:
|
|
// (call this function f(b, o) where b = base and o = offset)
|
|
// 1. Fast
|
|
// 2. f(b1, o) == f(b2, o) iff b1 == b2
|
|
// (Perfectly preserve base entropy)
|
|
// 3. f(b, o1) == f(b, o2) only if o1 == o2 or |o1-o2| >= 4 billion
|
|
// (Guaranteed uniqueness for nearby offsets)
|
|
// 3. f(b, o + j * 2**32) == f(b, o + k * 2**32) only if j == k
|
|
// (Upper bits matter, and *aligned* misplacement fails check)
|
|
// 4. f(b1, o) == f(b2, o + x) then preferably not
|
|
// f(b1, o + y) == f(b2, o + x + y)
|
|
// (Avoid linearly correlated matches)
|
|
// 5. f(b, o) == 0 depends on both b and o
|
|
// (No predictable overlap with non-context checksums)
|
|
uint32_t modifier =
|
|
base_context_checksum ^ (Lower32of64(offset) + Upper32of64(offset));
|
|
|
|
return modifier & all_or_nothing;
|
|
}
|
|
|
|
// Maps a table format_version to the compress_format_version used when
// encoding compressed blocks.
inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
  // As of format_version 2, we encode compressed block with
  // compress_format_version == 2. Before that, the version is 1.
  // DO NOT CHANGE THIS FUNCTION, it affects disk format
  return format_version >= 2 ? 2 : 1;
}
|
|
|
|
// Highest format_version accepted by IsSupportedFormatVersion().
constexpr uint32_t kLatestFormatVersion = 6;

// Returns true for every version from 0 through kLatestFormatVersion.
inline bool IsSupportedFormatVersion(uint32_t version) {
  return version <= kLatestFormatVersion;
}
|
|
|
|
// Same as having a unique id in footer.
// True for format_version 6 and later.
inline bool FormatVersionUsesContextChecksum(uint32_t version) {
  return version >= 6;
}
|
|
|
|
// True for format versions before 6.
inline bool FormatVersionUsesIndexHandleInFooter(uint32_t version) {
  return version < 6;
}
|
|
|
|
// Footer encapsulates the fixed information stored at the tail end of every
|
|
// SST file. In general, it should only include things that cannot go
|
|
// elsewhere under the metaindex block. For example, checksum_type is
|
|
// required for verifying metaindex block checksum (when applicable), but
|
|
// index block handle can easily go in metaindex block. See also FooterBuilder
|
|
// below.
|
|
class Footer {
|
|
public:
|
|
// Create empty. Populate using DecodeFrom.
|
|
Footer() {}
|
|
|
|
// Deserialize a footer (populate fields) from `input` and check for various
|
|
// corruptions. `input_offset` is the offset within the target file of
|
|
// `input` buffer, which is needed for verifying format_version >= 6 footer.
|
|
// If enforce_table_magic_number != 0, will return corruption if table magic
|
|
// number is not equal to enforce_table_magic_number.
|
|
Status DecodeFrom(Slice input, uint64_t input_offset,
|
|
uint64_t enforce_table_magic_number = 0);
|
|
|
|
// Table magic number identifies file as RocksDB SST file and which kind of
|
|
// SST format is use.
|
|
uint64_t table_magic_number() const { return table_magic_number_; }
|
|
|
|
// A version (footer and more) within a kind of SST. (It would add more
|
|
// unnecessary complexity to separate footer versions and
|
|
// BBTO::format_version.)
|
|
uint32_t format_version() const { return format_version_; }
|
|
|
|
// See ChecksumModifierForContext()
|
|
uint32_t base_context_checksum() const { return base_context_checksum_; }
|
|
|
|
// Block handle for metaindex block.
|
|
const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
|
|
|
|
// Block handle for (top-level) index block.
|
|
// TODO? remove from this struct and only read on decode for legacy cases
|
|
const BlockHandle& index_handle() const { return index_handle_; }
|
|
|
|
// Checksum type used in the file, including footer for format version >= 6.
|
|
ChecksumType checksum_type() const {
|
|
return static_cast<ChecksumType>(checksum_type_);
|
|
}
|
|
|
|
// Block trailer size used by file with this footer (e.g. 5 for block-based
|
|
// table and 0 for plain table). This is inferred from magic number so
|
|
// not in the serialized form.
|
|
inline size_t GetBlockTrailerSize() const { return block_trailer_size_; }
|
|
|
|
// Convert this object to a human readable form
|
|
std::string ToString() const;
|
|
|
|
// Encoded lengths of Footers. Bytes for serialized Footer will always be
|
|
// >= kMinEncodedLength and <= kMaxEncodedLength.
|
|
//
|
|
// Footer version 0 (legacy) will always occupy exactly this many bytes.
|
|
// It consists of two block handles, padding, and a magic number.
|
|
static constexpr uint32_t kVersion0EncodedLength =
|
|
2 * BlockHandle::kMaxEncodedLength + kMagicNumberLengthByte;
|
|
static constexpr uint32_t kMinEncodedLength = kVersion0EncodedLength;
|
|
|
|
// Footer of versions 1 and higher will always occupy exactly this many
|
|
// bytes. It originally consisted of the checksum type, two block handles,
|
|
// padding (to maximum handle encoding size), a format version number, and a
|
|
// magic number.
|
|
static constexpr uint32_t kNewVersionsEncodedLength =
|
|
1 + 2 * BlockHandle::kMaxEncodedLength + 4 + kMagicNumberLengthByte;
|
|
static constexpr uint32_t kMaxEncodedLength = kNewVersionsEncodedLength;
|
|
|
|
static constexpr uint64_t kNullTableMagicNumber = 0;
|
|
|
|
static constexpr uint32_t kInvalidFormatVersion = 0xffffffffU;
|
|
|
|
private:
|
|
static constexpr int kInvalidChecksumType =
|
|
(1 << (sizeof(ChecksumType) * 8)) | kNoChecksum;
|
|
|
|
uint64_t table_magic_number_ = kNullTableMagicNumber;
|
|
uint32_t format_version_ = kInvalidFormatVersion;
|
|
uint32_t base_context_checksum_ = 0;
|
|
BlockHandle metaindex_handle_;
|
|
BlockHandle index_handle_;
|
|
int checksum_type_ = kInvalidChecksumType;
|
|
uint8_t block_trailer_size_ = 0;
|
|
};
|
|
|
|
// Builder for Footer
|
|
class FooterBuilder {
|
|
public:
|
|
// Run builder in inputs. This is a single step with lots of parameters for
|
|
// efficiency (based on perf testing).
|
|
// * table_magic_number identifies file as RocksDB SST file and which kind of
|
|
// SST format is use.
|
|
// * format_version is a version for the footer and can also apply to other
|
|
// aspects of the SST file (see BlockBasedTableOptions::format_version).
|
|
// NOTE: To save complexity in the caller, when format_version == 0 and
|
|
// there is a corresponding legacy magic number to the one specified, the
|
|
// legacy magic number will be written for forward compatibility.
|
|
// * footer_offset is the file offset where the footer will be written
|
|
// (for future use).
|
|
// * checksum_type is for formats using block checksums.
|
|
// * index_handle is optional for some SST kinds and (for caller convenience)
|
|
// ignored when format_version >= 6. (Must be added to metaindex in that
|
|
// case.)
|
|
// * unique_id must be specified if format_vesion >= 6 and SST uses block
|
|
// checksums with context. Otherwise, auto-generated if format_vesion >= 6.
|
|
Status Build(uint64_t table_magic_number, uint32_t format_version,
|
|
uint64_t footer_offset, ChecksumType checksum_type,
|
|
const BlockHandle& metaindex_handle,
|
|
const BlockHandle& index_handle = BlockHandle::NullBlockHandle(),
|
|
uint32_t base_context_checksum = 0);
|
|
|
|
// After Builder, get a Slice for the serialized Footer, backed by this
|
|
// FooterBuilder.
|
|
const Slice& GetSlice() const {
|
|
assert(slice_.size());
|
|
return slice_;
|
|
}
|
|
|
|
private:
|
|
Slice slice_;
|
|
std::array<char, Footer::kMaxEncodedLength> data_;
|
|
};
|
|
|
|
// Read the footer from file
|
|
// If enforce_table_magic_number != 0, ReadFooterFromFile() will return
|
|
// corruption if table_magic number is not equal to enforce_table_magic_number
|
|
Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
|
|
FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
|
|
uint64_t file_size, Footer* footer,
|
|
uint64_t enforce_table_magic_number = 0);
|
|
|
|
// Computes a checksum using the given ChecksumType. Sometimes we need to
|
|
// include one more input byte logically at the end but not part of the main
|
|
// data buffer. If data_size >= 1, then
|
|
// ComputeBuiltinChecksum(type, data, size)
|
|
// ==
|
|
// ComputeBuiltinChecksumWithLastByte(type, data, size - 1, data[size - 1])
|
|
uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
|
|
size_t size);
|
|
uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
|
|
size_t size, char last_byte);
|
|
|
|
// Represents the contents of a block read from an SST file. Depending on how
|
|
// it's created, it may or may not own the actual block bytes. As an example,
|
|
// BlockContents objects representing data read from mmapped files only point
|
|
// into the mmapped region. Depending on context, it might be a serialized
|
|
// (potentially compressed) block, including a trailer beyond `size`, or an
|
|
// uncompressed block.
|
|
//
|
|
// Please try to use this terminology when dealing with blocks:
|
|
// * "Serialized block" - bytes that go into storage. For block-based table
|
|
// (usually the case) this includes the block trailer. Here the `size` does
|
|
// not include the trailer, but other places in code might include the trailer
|
|
// in the size.
|
|
// * "Maybe compressed block" - like a serialized block, but without the
|
|
// trailer (or no promise of including a trailer). Must be accompanied by a
|
|
// CompressionType in some other variable or field.
|
|
// * "Uncompressed block" - "payload" bytes that are either stored with no
|
|
// compression, used as input to compression function, or result of
|
|
// decompression function.
|
|
// * "Parsed block" - an in-memory form of a block in block cache, as it is
|
|
// used by the table reader. Different C++ types are used depending on the
|
|
// block type (see block_cache.h). Only trivially parsable block types
|
|
// use BlockContents as the parsed form.
|
|
//
|
|
struct BlockContents {
|
|
// Points to block payload (without trailer)
|
|
Slice data;
|
|
CacheAllocationPtr allocation;
|
|
|
|
#ifndef NDEBUG
|
|
// Whether there is a known trailer after what is pointed to by `data`.
|
|
// See BlockBasedTable::GetCompressionType.
|
|
bool has_trailer = false;
|
|
#endif // NDEBUG
|
|
|
|
BlockContents() {}
|
|
|
|
// Does not take ownership of the underlying data bytes.
|
|
BlockContents(const Slice& _data) : data(_data) {}
|
|
|
|
// Takes ownership of the underlying data bytes.
|
|
BlockContents(CacheAllocationPtr&& _data, size_t _size)
|
|
: data(_data.get(), _size), allocation(std::move(_data)) {}
|
|
|
|
// Takes ownership of the underlying data bytes.
|
|
BlockContents(std::unique_ptr<char[]>&& _data, size_t _size)
|
|
: data(_data.get(), _size) {
|
|
allocation.reset(_data.release());
|
|
}
|
|
|
|
// Returns whether the object has ownership of the underlying data bytes.
|
|
bool own_bytes() const { return allocation.get() != nullptr; }
|
|
|
|
// The additional memory space taken by the block data.
|
|
size_t usable_size() const {
|
|
if (allocation.get() != nullptr) {
|
|
auto allocator = allocation.get_deleter().allocator;
|
|
if (allocator) {
|
|
return allocator->UsableSize(allocation.get(), data.size());
|
|
}
|
|
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
|
|
return malloc_usable_size(allocation.get());
|
|
#else
|
|
return data.size();
|
|
#endif // ROCKSDB_MALLOC_USABLE_SIZE
|
|
} else {
|
|
return 0; // no extra memory is occupied by the data
|
|
}
|
|
}
|
|
|
|
size_t ApproximateMemoryUsage() const {
|
|
return usable_size() + sizeof(*this);
|
|
}
|
|
|
|
BlockContents(BlockContents&& other) noexcept { *this = std::move(other); }
|
|
|
|
BlockContents& operator=(BlockContents&& other) {
|
|
data = std::move(other.data);
|
|
allocation = std::move(other.allocation);
|
|
#ifndef NDEBUG
|
|
has_trailer = other.has_trailer;
|
|
#endif // NDEBUG
|
|
return *this;
|
|
}
|
|
};
|
|
|
|
// The `data` points to serialized block contents read in from file, which
|
|
// must be compressed and include a trailer beyond `size`. A new buffer is
|
|
// allocated with the given allocator (or default) and the uncompressed
|
|
// contents are returned in `out_contents`.
|
|
// format_version is as defined in include/rocksdb/table.h, which is
|
|
// used to determine compression format version.
|
|
Status UncompressSerializedBlock(const UncompressionInfo& info,
|
|
const char* data, size_t size,
|
|
BlockContents* out_contents,
|
|
uint32_t format_version,
|
|
const ImmutableOptions& ioptions,
|
|
MemoryAllocator* allocator = nullptr);
|
|
|
|
// This is a variant of UncompressSerializedBlock that does not expect a
|
|
// block trailer beyond `size`. (CompressionType is taken from `info`.)
|
|
Status UncompressBlockData(const UncompressionInfo& info, const char* data,
|
|
size_t size, BlockContents* out_contents,
|
|
uint32_t format_version,
|
|
const ImmutableOptions& ioptions,
|
|
MemoryAllocator* allocator = nullptr);
|
|
|
|
// Replace db_host_id contents with the real hostname if necessary
|
|
Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id);
|
|
|
|
// Implementation details follow. Clients should ignore,
|
|
|
|
// TODO(andrewkr): we should prefer one way of representing a null/uninitialized
|
|
// BlockHandle. Currently we use zeros for null and use negation-of-zeros for
|
|
// uninitialized.
|
|
inline BlockHandle::BlockHandle() : BlockHandle(~uint64_t{0}, ~uint64_t{0}) {}
|
|
|
|
inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size)
|
|
: offset_(_offset), size_(_size) {}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|