mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-29 00:34:03 +00:00
f9cfc6a808
Summary: …ta blocks During MyShadow testing, ajkr helped me find out that with partitioned index and dictionary compression enabled, `PartitionedIndexIterator::InitPartitionedIndexBlock()` spent considerable amount of time (1-2% CPU) on fetching uncompression dictionary. Fetching uncompression dict was not needed since the index blocks were not compressed (and even if they were, they use empty dictionary). This should only affect use cases with partitioned index, dictionary compression and without uncompression dictionary pinned. This PR updates NewDataBlockIterator to not fetch uncompression dictionary when it is not for data blocks. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10310 Test Plan: 1. `make check` 2. Perf benchmark: 1.5% (143950 -> 146176) improvement in op/sec for partitioned index + dict compression benchmark. For default config without partitioned index and without dict compression, there is no regression in readrandom perf from multiple runs of db_bench. 
``` # Set up for partitioned index with dictionary compression TEST_TMPDIR=/dev/shm ./db_bench_main -benchmarks=filluniquerandom,compact -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -partition_index=true -compression_max_dict_bytes=16384 -compression_zstd_max_train_bytes=1638400 # Pre PR TEST_TMPDIR=/dev/shm ./db_bench_main -use_existing_db=true -benchmarks=readrandom[-X50] -partition_index=true readrandom [AVG 50 runs] : 143950 (± 1108) ops/sec; 15.9 (± 0.1) MB/sec readrandom [MEDIAN 50 runs] : 144406 ops/sec; 16.0 MB/sec # Post PR TEST_TMPDIR=/dev/shm ./db_bench_opt -use_existing_db=true -benchmarks=readrandom[-X50] -partition_index=true readrandom [AVG 50 runs] : 146176 (± 1121) ops/sec; 16.2 (± 0.1) MB/sec readrandom [MEDIAN 50 runs] : 146014 ops/sec; 16.2 MB/sec # Set up for no partitioned index and no dictionary compression TEST_TMPDIR=/dev/shm/baseline ./db_bench_main -benchmarks=filluniquerandom,compact -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false # Pre PR TEST_TMPDIR=/dev/shm/baseline/ ./db_bench_main --use_existing_db=true "--benchmarks=readrandom[-X50]" readrandom [AVG 50 runs] : 158546 (± 1000) ops/sec; 17.5 (± 0.1) MB/sec readrandom [MEDIAN 50 runs] : 158280 ops/sec; 17.5 MB/sec # Post PR TEST_TMPDIR=/dev/shm/baseline/ ./db_bench_opt --use_existing_db=true "--benchmarks=readrandom[-X50]" readrandom [AVG 50 runs] : 161061 (± 1520) ops/sec; 17.8 (± 0.2) MB/sec readrandom [MEDIAN 50 runs] : 161596 ops/sec; 17.9 MB/sec ``` Reviewed By: ajkr Differential Revision: D37631358 Pulled By: cbi42 fbshipit-source-id: 6ca2665e270e63871968e061ba4a99d3136785d9
173 lines
6.8 KiB
C++
173 lines
6.8 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
#pragma once
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/block_based/reader_common.h"
|
|
|
|
// This file contains member functions of BlockBasedTable that cannot be
// implemented in block_based_table_reader.cc because they are templates and
// are called from other files (e.g. block_based_iterator.h).
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
// Convert an index iterator value (i.e., an encoded BlockHandle)
|
|
// into an iterator over the contents of the corresponding block.
|
|
// If input_iter is null, new a iterator
|
|
// If input_iter is not null, update this iter and return it
|
|
template <typename TBlockIter>
|
|
TBlockIter* BlockBasedTable::NewDataBlockIterator(
|
|
const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
|
|
BlockType block_type, GetContext* get_context,
|
|
BlockCacheLookupContext* lookup_context,
|
|
FilePrefetchBuffer* prefetch_buffer, bool for_compaction, bool async_read,
|
|
Status& s) const {
|
|
PERF_TIMER_GUARD(new_table_block_iter_nanos);
|
|
|
|
TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
|
|
if (!s.ok()) {
|
|
iter->Invalidate(s);
|
|
return iter;
|
|
}
|
|
|
|
CachableEntry<Block> block;
|
|
if (rep_->uncompression_dict_reader && block_type == BlockType::kData) {
|
|
CachableEntry<UncompressionDict> uncompression_dict;
|
|
const bool no_io = (ro.read_tier == kBlockCacheTier);
|
|
s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
|
|
prefetch_buffer, no_io, ro.verify_checksums, get_context,
|
|
lookup_context, &uncompression_dict);
|
|
if (!s.ok()) {
|
|
iter->Invalidate(s);
|
|
return iter;
|
|
}
|
|
const UncompressionDict& dict = uncompression_dict.GetValue()
|
|
? *uncompression_dict.GetValue()
|
|
: UncompressionDict::GetEmptyDict();
|
|
s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type,
|
|
get_context, lookup_context, for_compaction,
|
|
/* use_cache */ true, /* wait_for_cache */ true,
|
|
async_read);
|
|
} else {
|
|
s = RetrieveBlock(
|
|
prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(), &block,
|
|
block_type, get_context, lookup_context, for_compaction,
|
|
/* use_cache */ true, /* wait_for_cache */ true, async_read);
|
|
}
|
|
|
|
if (s.IsTryAgain() && async_read) {
|
|
return iter;
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
assert(block.IsEmpty());
|
|
iter->Invalidate(s);
|
|
return iter;
|
|
}
|
|
|
|
assert(block.GetValue() != nullptr);
|
|
|
|
// Block contents are pinned and it is still pinned after the iterator
|
|
// is destroyed as long as cleanup functions are moved to another object,
|
|
// when:
|
|
// 1. block cache handle is set to be released in cleanup function, or
|
|
// 2. it's pointing to immortal source. If own_bytes is true then we are
|
|
// not reading data from the original source, whether immortal or not.
|
|
// Otherwise, the block is pinned iff the source is immortal.
|
|
const bool block_contents_pinned =
|
|
block.IsCached() ||
|
|
(!block.GetValue()->own_bytes() && rep_->immortal_table);
|
|
iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), block_type, iter,
|
|
block_contents_pinned);
|
|
|
|
if (!block.IsCached()) {
|
|
if (!ro.fill_cache) {
|
|
Cache* const block_cache = rep_->table_options.block_cache.get();
|
|
if (block_cache) {
|
|
// insert a dummy record to block cache to track the memory usage
|
|
Cache::Handle* cache_handle = nullptr;
|
|
CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache);
|
|
s = block_cache->Insert(key.AsSlice(), nullptr,
|
|
block.GetValue()->ApproximateMemoryUsage(),
|
|
nullptr, &cache_handle);
|
|
|
|
if (s.ok()) {
|
|
assert(cache_handle != nullptr);
|
|
iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
|
|
cache_handle);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
iter->SetCacheHandle(block.GetCacheHandle());
|
|
}
|
|
|
|
block.TransferTo(iter);
|
|
|
|
return iter;
|
|
}
|
|
|
|
// Convert an uncompressed data block (i.e CachableEntry<Block>)
|
|
// into an iterator over the contents of the corresponding block.
|
|
// If input_iter is null, new a iterator
|
|
// If input_iter is not null, update this iter and return it
|
|
template <typename TBlockIter>
|
|
TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro,
|
|
CachableEntry<Block>& block,
|
|
TBlockIter* input_iter,
|
|
Status s) const {
|
|
PERF_TIMER_GUARD(new_table_block_iter_nanos);
|
|
|
|
TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
|
|
if (!s.ok()) {
|
|
iter->Invalidate(s);
|
|
return iter;
|
|
}
|
|
|
|
assert(block.GetValue() != nullptr);
|
|
// Block contents are pinned and it is still pinned after the iterator
|
|
// is destroyed as long as cleanup functions are moved to another object,
|
|
// when:
|
|
// 1. block cache handle is set to be released in cleanup function, or
|
|
// 2. it's pointing to immortal source. If own_bytes is true then we are
|
|
// not reading data from the original source, whether immortal or not.
|
|
// Otherwise, the block is pinned iff the source is immortal.
|
|
const bool block_contents_pinned =
|
|
block.IsCached() ||
|
|
(!block.GetValue()->own_bytes() && rep_->immortal_table);
|
|
iter = InitBlockIterator<TBlockIter>(rep_, block.GetValue(), BlockType::kData,
|
|
iter, block_contents_pinned);
|
|
|
|
if (!block.IsCached()) {
|
|
if (!ro.fill_cache) {
|
|
Cache* const block_cache = rep_->table_options.block_cache.get();
|
|
if (block_cache) {
|
|
// insert a dummy record to block cache to track the memory usage
|
|
Cache::Handle* cache_handle = nullptr;
|
|
CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache);
|
|
s = block_cache->Insert(key.AsSlice(), nullptr,
|
|
block.GetValue()->ApproximateMemoryUsage(),
|
|
nullptr, &cache_handle);
|
|
|
|
if (s.ok()) {
|
|
assert(cache_handle != nullptr);
|
|
iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
|
|
cache_handle);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
iter->SetCacheHandle(block.GetCacheHandle());
|
|
}
|
|
|
|
block.TransferTo(iter);
|
|
return iter;
|
|
}
|
|
} // namespace ROCKSDB_NAMESPACE
|