mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-26 07:30:54 +00:00
edd47c5104
Summary: Add an encoding feature to PlainTable that encodes PlainTable's keys so that keys sharing the same prefix take fewer bytes. The data format is documented in table/plain_table_factory.h Test Plan: Add unit test coverage in plain_table_db_test Reviewers: yhchiang, igor, dhruba, ljin, haobo Reviewed By: haobo Subscribers: nkg-, leveldb Differential Revision: https://reviews.facebook.net/D18735
270 lines
10 KiB
C++
270 lines
10 KiB
C++
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#pragma once
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
#include <unordered_map>
|
|
#include <memory>
|
|
#include <vector>
|
|
#include <string>
|
|
#include <stdint.h>
|
|
|
|
#include "db/dbformat.h"
|
|
#include "rocksdb/env.h"
|
|
#include "rocksdb/iterator.h"
|
|
#include "rocksdb/slice_transform.h"
|
|
#include "rocksdb/table.h"
|
|
#include "rocksdb/table_properties.h"
|
|
#include "table/table_reader.h"
|
|
#include "table/plain_table_factory.h"
|
|
#include "util/arena.h"
|
|
#include "util/dynamic_bloom.h"
|
|
|
|
namespace rocksdb {
|
|
|
|
class Block;
|
|
class BlockHandle;
|
|
class Footer;
|
|
struct Options;
|
|
class RandomAccessFile;
|
|
struct ReadOptions;
|
|
class TableCache;
|
|
class TableReader;
|
|
class InternalKeyComparator;
|
|
class PlainTableKeyDecoder;
|
|
|
|
using std::unique_ptr;
|
|
using std::unordered_map;
|
|
extern const uint32_t kPlainTableVariableLength;
|
|
|
|
// Based on following output file format shown in plain_table_factory.h
|
|
// When opening the output file, IndexedTableReader creates a hash table
|
|
// from key prefixes to offset of the output file. IndexedTable will decide
|
|
// whether it points to the data offset of the first key with the key prefix
|
|
// or the offset of it. If there are too many keys share this prefix, it will
|
|
// create a binary search-able index from the suffix to offset on disk.
|
|
//
|
|
// The implementation of IndexedTableReader requires output file is mmaped
|
|
class PlainTableReader: public TableReader {
|
|
public:
|
|
static Status Open(const Options& options, const EnvOptions& soptions,
|
|
const InternalKeyComparator& internal_comparator,
|
|
unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
|
|
unique_ptr<TableReader>* table,
|
|
const int bloom_bits_per_key, double hash_table_ratio,
|
|
size_t index_sparseness, size_t huge_page_tlb_size,
|
|
bool full_scan_mode);
|
|
|
|
Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override;
|
|
|
|
void Prepare(const Slice& target);
|
|
|
|
Status Get(const ReadOptions&, const Slice& key, void* arg,
|
|
bool (*result_handler)(void* arg, const ParsedInternalKey& k,
|
|
const Slice& v, bool),
|
|
void (*mark_key_may_exist)(void*) = nullptr);
|
|
|
|
uint64_t ApproximateOffsetOf(const Slice& key);
|
|
|
|
void SetupForCompaction();
|
|
|
|
std::shared_ptr<const TableProperties> GetTableProperties() const {
|
|
return table_properties_;
|
|
}
|
|
|
|
PlainTableReader(const Options& options, unique_ptr<RandomAccessFile>&& file,
|
|
const EnvOptions& storage_options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
EncodingType encoding_type, uint64_t file_size,
|
|
const TableProperties* table_properties);
|
|
virtual ~PlainTableReader();
|
|
|
|
protected:
|
|
// Check bloom filter to see whether it might contain this prefix.
|
|
// The hash of the prefix is given, since it can be reused for index lookup
|
|
// too.
|
|
virtual bool MatchBloom(uint32_t hash) const;
|
|
|
|
// PopulateIndex() builds index of keys. It must be called before any query
|
|
// to the table.
|
|
//
|
|
// props: the table properties object that need to be stored. Ownership of
|
|
// the object will be passed.
|
|
//
|
|
// index_ contains buckets size of index_size_, each is a
|
|
// 32-bit integer. The lower 31 bits contain an offset value (explained below)
|
|
// and the first bit of the integer indicates type of the offset.
|
|
//
|
|
// +--------------+------------------------------------------------------+
|
|
// | Flag (1 bit) | Offset to binary search buffer or file (31 bits) +
|
|
// +--------------+------------------------------------------------------+
|
|
//
|
|
// Explanation for the "flag bit":
|
|
//
|
|
// 0 indicates that the bucket contains only one prefix (no conflict when
|
|
// hashing this prefix), whose first row starts from this offset of the
|
|
// file.
|
|
// 1 indicates that the bucket contains more than one prefixes, or there
|
|
// are too many rows for one prefix so we need a binary search for it. In
|
|
// this case, the offset indicates the offset of sub_index_ holding the
|
|
// binary search indexes of keys for those rows. Those binary search indexes
|
|
// are organized in this way:
|
|
//
|
|
// The first 4 bytes, indicate how many indexes (N) are stored after it. After
|
|
// it, there are N 32-bit integers, each points of an offset of the file,
|
|
// which
|
|
// points to starting of a row. Those offsets need to be guaranteed to be in
|
|
// ascending order so the keys they are pointing to are also in ascending
|
|
// order
|
|
// to make sure we can use them to do binary searches. Below is visual
|
|
// presentation of a bucket.
|
|
//
|
|
// <begin>
|
|
// number_of_records: varint32
|
|
// record 1 file offset: fixedint32
|
|
// record 2 file offset: fixedint32
|
|
// ....
|
|
// record N file offset: fixedint32
|
|
// <end>
|
|
Status PopulateIndex(TableProperties* props, int bloom_bits_per_key,
|
|
double hash_table_ratio, size_t index_sparseness,
|
|
size_t huge_page_tlb_size);
|
|
Status MmapDataFile();
|
|
|
|
private:
|
|
struct IndexRecord;
|
|
class IndexRecordList;
|
|
|
|
// Plain table maintains an index and a sub index.
|
|
// index is implemented by a hash table.
|
|
// subindex is a big of memory array.
|
|
// For more details about the in-memory index, please refer to:
|
|
// https://github.com/facebook/rocksdb/wiki/PlainTable-Format
|
|
// #wiki-in-memory-index-format
|
|
uint32_t* index_;
|
|
int index_size_ = 0;
|
|
char* sub_index_;
|
|
const InternalKeyComparator internal_comparator_;
|
|
EncodingType encoding_type_;
|
|
// represents plain table's current status.
|
|
Status status_;
|
|
Slice file_data_;
|
|
|
|
// data_start_offset_ and data_end_offset_ defines the range of the
|
|
// sst file that stores data.
|
|
const uint32_t data_start_offset_ = 0;
|
|
const uint32_t data_end_offset_;
|
|
const size_t user_key_len_;
|
|
const SliceTransform* prefix_extractor_;
|
|
|
|
static const size_t kNumInternalBytes = 8;
|
|
static const uint32_t kSubIndexMask = 0x80000000;
|
|
static const size_t kOffsetLen = sizeof(uint32_t);
|
|
static const uint64_t kMaxFileSize = 1u << 31;
|
|
static const size_t kRecordsPerGroup = 256;
|
|
static const int kFullScanModeFlag = -1;
|
|
|
|
// Bloom filter is used to rule out non-existent key
|
|
bool enable_bloom_;
|
|
DynamicBloom bloom_;
|
|
Arena arena_;
|
|
|
|
const Options& options_;
|
|
unique_ptr<RandomAccessFile> file_;
|
|
uint32_t file_size_;
|
|
std::shared_ptr<const TableProperties> table_properties_;
|
|
|
|
bool IsFixedLength() const {
|
|
return user_key_len_ != kPlainTableVariableLength;
|
|
}
|
|
|
|
size_t GetFixedInternalKeyLength() const {
|
|
return user_key_len_ + kNumInternalBytes;
|
|
}
|
|
|
|
friend class TableCache;
|
|
friend class PlainTableIterator;
|
|
|
|
// Internal helper function to generate an IndexRecordList object from all
|
|
// the rows, which contains index records as a list.
|
|
// If bloom_ is not null, all the keys' full-key hash will be added to the
|
|
// bloom filter.
|
|
Status PopulateIndexRecordList(IndexRecordList* record_list,
|
|
int* num_prefixes, int bloom_bits_per_key,
|
|
size_t index_sparseness);
|
|
|
|
// Internal helper function to allocate memory for indexes and bloom filters
|
|
void AllocateIndexAndBloom(int num_prefixes, int bloom_bits_per_key,
|
|
double hash_table_ratio,
|
|
size_t huge_page_tlb_size);
|
|
|
|
// Internal helper function to bucket index record list to hash buckets.
|
|
// bucket_header is a vector of size hash_table_size_, with each entry
|
|
// containing a linklist of IndexRecord hashed to the same bucket, in reverse
|
|
// order.
|
|
// of offsets for the hash, in reversed order.
|
|
// entries_per_bucket is sized of index_size_. The value is how many index
|
|
// records are there in bucket_headers for the same bucket.
|
|
size_t BucketizeIndexesAndFillBloom(
|
|
IndexRecordList* record_list, std::vector<IndexRecord*>* bucket_headers,
|
|
std::vector<uint32_t>* entries_per_bucket);
|
|
|
|
// Internal helper class to fill the indexes and bloom filters to internal
|
|
// data structures. bucket_headers and entries_per_bucket are bucketized
|
|
// indexes and counts generated by BucketizeIndexesAndFillBloom().
|
|
void FillIndexes(const size_t kSubIndexSize,
|
|
const std::vector<IndexRecord*>& bucket_headers,
|
|
const std::vector<uint32_t>& entries_per_bucket,
|
|
size_t huge_page_tlb_size);
|
|
|
|
// Read the key and value at `offset` to parameters for keys, the and
|
|
// `seekable`.
|
|
// On success, `offset` will be updated as the offset for the next key.
|
|
// `parsed_key` will be key in parsed format.
|
|
// if `internal_key` is not empty, it will be filled with key with slice
|
|
// format.
|
|
// if `seekable` is not null, it will return whether we can directly read
|
|
// data using this offset.
|
|
Status Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
|
|
ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value,
|
|
bool* seekable = nullptr) const;
|
|
// Get file offset for key target.
|
|
// return value prefix_matched is set to true if the offset is confirmed
|
|
// for a key with the same prefix as target.
|
|
Status GetOffset(const Slice& target, const Slice& prefix,
|
|
uint32_t prefix_hash, bool& prefix_matched,
|
|
uint32_t* offset) const;
|
|
|
|
Slice GetUserKey(const Slice& key) const {
|
|
return Slice(key.data(), key.size() - 8);
|
|
}
|
|
|
|
Slice GetPrefix(const Slice& target) const {
|
|
assert(target.size() >= 8); // target is internal key
|
|
return GetPrefixFromUserKey(GetUserKey(target));
|
|
}
|
|
|
|
inline Slice GetPrefix(const ParsedInternalKey& target) const;
|
|
|
|
Slice GetPrefixFromUserKey(const Slice& user_key) const {
|
|
if (!IsTotalOrderMode()) {
|
|
return prefix_extractor_->Transform(user_key);
|
|
} else {
|
|
// Use empty slice as prefix if prefix_extractor is not set. In that case,
|
|
// it falls back to pure binary search and total iterator seek is
|
|
// supported.
|
|
return Slice();
|
|
}
|
|
}
|
|
|
|
bool IsTotalOrderMode() const { return (prefix_extractor_ == nullptr); }
|
|
|
|
// No copying allowed
|
|
explicit PlainTableReader(const TableReader&) = delete;
|
|
void operator=(const TableReader&) = delete;
|
|
};
|
|
} // namespace rocksdb
|
|
#endif // ROCKSDB_LITE
|