rocksdb/table/format.cc

583 lines
21 KiB
C++
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/format.h"
#include <cinttypes>
#include <string>
#include "block_fetcher.h"
#include "file/random_access_file_reader.h"
#include "memory/memory_allocator.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "options/options_helper.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_reader.h"
#include "table/persistent_cache_helper.h"
#include "util/cast_util.h"
#include "util/coding.h"
#include "util/compression.h"
#include "util/crc32c.h"
#include "util/hash.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/xxhash.h"
namespace ROCKSDB_NAMESPACE {
extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kBlockBasedTableMagicNumber;
2014-05-08 00:45:27 +00:00
#ifndef ROCKSDB_LITE
extern const uint64_t kLegacyPlainTableMagicNumber;
extern const uint64_t kPlainTableMagicNumber;
2014-05-08 00:45:27 +00:00
#else
// ROCKSDB_LITE doesn't have plain table
const uint64_t kLegacyPlainTableMagicNumber = 0;
const uint64_t kPlainTableMagicNumber = 0;
#endif
const char* kHostnameForDbHostId = "__hostname__";
bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
return env != nullptr && stats != nullptr &&
stats->get_stats_level() > kExceptDetailedTimers;
}
void BlockHandle::EncodeTo(std::string* dst) const {
// Sanity check that all fields have been set
assert(offset_ != ~uint64_t{0});
assert(size_ != ~uint64_t{0});
Miscellaneous performance improvements Summary: I was investigating performance issues in the SstFileWriter and found all of the following: - The SstFileWriter::Add() function created a local InternalKey every time it was called generating a allocation and free each time. Changed to have an InternalKey member variable that can be reset with the new InternalKey::Set() function. - In SstFileWriter::Add() the smallest_key and largest_key values were assigned the result of a ToString() call, but it is simpler to just assign them directly from the user's key. - The Slice class had no move constructor so each time one was returned from a function a new one had to be allocated, the old data copied to the new, and the old one was freed. I added the move constructor which also required a copy constructor and assignment operator. - The BlockBuilder::CurrentSizeEstimate() function calculates the current estimate size, but was being called 2 or 3 times for each key added. I changed the class to maintain a running estimate (equal to the original calculation) so that the function can return an already calculated value. - The code in BlockBuilder::Add() that calculated the shared bytes between the last key and the new key duplicated what Slice::difference_offset does, so I replaced it with the standard function. - BlockBuilder::Add() had code to copy just the changed portion into the last key value (and asserted that it now matched the new key). It is more efficient just to copy the whole new key over. - Moved this same code up into the 'if (use_delta_encoding_)' since the last key value is only needed when delta encoding is on. - FlushBlockBySizePolicy::BlockAlmostFull calculated a standard deviation value each time it was called, but this information would only change if block_size of block_size_deviation changed, so I created a member variable to hold the value to avoid the calculation each time. - Each PutVarint??() function has a buffer and calls std::string::append(). Two or three calls in a row could share a buffer and a single call to std::string::append(). Some of these will be helpful outside of the SstFileWriter. I'm not 100% the addition of the move constructor is appropriate as I wonder why this wasn't done before - maybe because of compiler compatibility? I tried it on gcc 4.8 and 4.9. Test Plan: The changes should not affect the results so the existing tests should all still work and no new tests were added. The value of the changes was seen by manually testing the SstFileWriter class through MyRocks and adding timing code to identify problem areas. Reviewers: sdong, IslamAbdelRahman Reviewed By: IslamAbdelRahman Subscribers: andrewkr, dhruba Differential Revision: https://reviews.facebook.net/D59607
2016-06-13 16:57:43 +00:00
PutVarint64Varint64(dst, offset_, size_);
}
char* BlockHandle::EncodeTo(char* dst) const {
// Sanity check that all fields have been set
assert(offset_ != ~uint64_t{0});
assert(size_ != ~uint64_t{0});
char* cur = EncodeVarint64(dst, offset_);
cur = EncodeVarint64(cur, size_);
return cur;
}
Status BlockHandle::DecodeFrom(Slice* input) {
if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) {
return Status::OK();
} else {
// reset in case failure after partially decoding
offset_ = 0;
size_ = 0;
return Status::Corruption("bad block handle");
}
}
Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
if (GetVarint64(input, &size_)) {
offset_ = _offset;
return Status::OK();
} else {
// reset in case failure after partially decoding
offset_ = 0;
size_ = 0;
return Status::Corruption("bad block handle");
}
}
// Return a string that contains the copy of handle.
std::string BlockHandle::ToString(bool hex) const {
std::string handle_str;
EncodeTo(&handle_str);
if (hex) {
return Slice(handle_str).ToString(true);
} else {
return handle_str;
}
}
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
const BlockHandle* previous_handle) const {
if (previous_handle) {
Improve / clean up meta block code & integrity (#9163) Summary: * Checksums are now checked on meta blocks unless specifically suppressed or not applicable (e.g. plain table). (Was other way around.) This means a number of cases that were not checking checksums now are, including direct read TableProperties in Version::GetTableProperties (fixed in meta_blocks ReadTableProperties), reading any block from PersistentCache (fixed in BlockFetcher), read TableProperties in SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open, maybe more. * For that to work, I moved the global_seqno+TableProperties checksum logic to the shared table/ code, because that is used by many utilies such as SstFileDumper. * Also for that to work, we have to know when we're dealing with a block that has a checksum (trailer), so added that capability to Footer based on magic number, and from there BlockFetcher. * Knowledge of trailer presence has also fixed a problem where other table formats were reading blocks including bytes for a non-existant trailer--and awkwardly kind-of not using them, e.g. no shared code checking checksums. (BlockFetcher compression type was populated incorrectly.) Now we only read what is needed. * Minimized code duplication and differing/incompatible/awkward abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block without parsing block handle) * Moved some meta block handling code from table_properties*.* * Moved some code specific to block-based table from shared table/ code to BlockBasedTable class. The checksum stuff means we can't completely separate it, but things that don't need to be in shared table/ code should not be. * Use unique_ptr rather than raw ptr in more places. (Note: you can std::move from unique_ptr to shared_ptr.) Without enhancements to GetPropertiesOfAllTablesTest (see below), net reduction of roughly 100 lines of code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163 Test Plan: existing tests and * Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that checksums are now checked on direct read of table properties by TableCache (new test would fail before this change) * Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test putting table properties under old meta name * Also generally enhanced that same test to actually test what it was supposed to be testing already, by kicking things out of table cache when we don't want them there. Reviewed By: ajkr, mrambacher Differential Revision: D32514757 Pulled By: pdillinger fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
// WART: this is specific to Block-based table
Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
assert(handle.offset() == previous_handle->offset() +
Improve / clean up meta block code & integrity (#9163) Summary: * Checksums are now checked on meta blocks unless specifically suppressed or not applicable (e.g. plain table). (Was other way around.) This means a number of cases that were not checking checksums now are, including direct read TableProperties in Version::GetTableProperties (fixed in meta_blocks ReadTableProperties), reading any block from PersistentCache (fixed in BlockFetcher), read TableProperties in SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open, maybe more. * For that to work, I moved the global_seqno+TableProperties checksum logic to the shared table/ code, because that is used by many utilies such as SstFileDumper. * Also for that to work, we have to know when we're dealing with a block that has a checksum (trailer), so added that capability to Footer based on magic number, and from there BlockFetcher. * Knowledge of trailer presence has also fixed a problem where other table formats were reading blocks including bytes for a non-existant trailer--and awkwardly kind-of not using them, e.g. no shared code checking checksums. (BlockFetcher compression type was populated incorrectly.) Now we only read what is needed. * Minimized code duplication and differing/incompatible/awkward abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block without parsing block handle) * Moved some meta block handling code from table_properties*.* * Moved some code specific to block-based table from shared table/ code to BlockBasedTable class. The checksum stuff means we can't completely separate it, but things that don't need to be in shared table/ code should not be. * Use unique_ptr rather than raw ptr in more places. (Note: you can std::move from unique_ptr to shared_ptr.) Without enhancements to GetPropertiesOfAllTablesTest (see below), net reduction of roughly 100 lines of code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163 Test Plan: existing tests and * Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that checksums are now checked on direct read of table properties by TableCache (new test would fail before this change) * Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test putting table properties under old meta name * Also generally enhanced that same test to actually test what it was supposed to be testing already, by kicking things out of table cache when we don't want them there. Reviewed By: ajkr, mrambacher Differential Revision: D32514757 Pulled By: pdillinger fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
previous_handle->size() +
BlockBasedTable::kBlockTrailerSize);
Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
PutVarsignedint64(dst, handle.size() - previous_handle->size());
} else {
handle.EncodeTo(dst);
}
assert(dst->size() != 0);
if (have_first_key) {
PutLengthPrefixedSlice(dst, first_internal_key);
}
}
Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
const BlockHandle* previous_handle) {
if (previous_handle) {
int64_t delta;
if (!GetVarsignedint64(input, &delta)) {
return Status::Corruption("bad delta-encoded index value");
}
Improve / clean up meta block code & integrity (#9163) Summary: * Checksums are now checked on meta blocks unless specifically suppressed or not applicable (e.g. plain table). (Was other way around.) This means a number of cases that were not checking checksums now are, including direct read TableProperties in Version::GetTableProperties (fixed in meta_blocks ReadTableProperties), reading any block from PersistentCache (fixed in BlockFetcher), read TableProperties in SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open, maybe more. * For that to work, I moved the global_seqno+TableProperties checksum logic to the shared table/ code, because that is used by many utilies such as SstFileDumper. * Also for that to work, we have to know when we're dealing with a block that has a checksum (trailer), so added that capability to Footer based on magic number, and from there BlockFetcher. * Knowledge of trailer presence has also fixed a problem where other table formats were reading blocks including bytes for a non-existant trailer--and awkwardly kind-of not using them, e.g. no shared code checking checksums. (BlockFetcher compression type was populated incorrectly.) Now we only read what is needed. * Minimized code duplication and differing/incompatible/awkward abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block without parsing block handle) * Moved some meta block handling code from table_properties*.* * Moved some code specific to block-based table from shared table/ code to BlockBasedTable class. The checksum stuff means we can't completely separate it, but things that don't need to be in shared table/ code should not be. * Use unique_ptr rather than raw ptr in more places. (Note: you can std::move from unique_ptr to shared_ptr.) Without enhancements to GetPropertiesOfAllTablesTest (see below), net reduction of roughly 100 lines of code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163 Test Plan: existing tests and * Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that checksums are now checked on direct read of table properties by TableCache (new test would fail before this change) * Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test putting table properties under old meta name * Also generally enhanced that same test to actually test what it was supposed to be testing already, by kicking things out of table cache when we don't want them there. Reviewed By: ajkr, mrambacher Differential Revision: D32514757 Pulled By: pdillinger fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
// WART: this is specific to Block-based table
handle = BlockHandle(previous_handle->offset() + previous_handle->size() +
BlockBasedTable::kBlockTrailerSize,
previous_handle->size() + delta);
Add an option to put first key of each sst block in the index (#5289) Summary: The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes. Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it. So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks. Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files. This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289 Differential Revision: D15256423 Pulled By: al13n321 fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
} else {
Status s = handle.DecodeFrom(input);
if (!s.ok()) {
return s;
}
}
if (!have_first_key) {
first_internal_key = Slice();
} else if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
return Status::Corruption("bad first key in block info");
}
return Status::OK();
}
std::string IndexValue::ToString(bool hex, bool have_first_key) const {
std::string s;
EncodeTo(&s, have_first_key, nullptr);
if (hex) {
return Slice(s).ToString(true);
} else {
return s;
}
}
namespace {
inline bool IsLegacyFooterFormat(uint64_t magic_number) {
return magic_number == kLegacyBlockBasedTableMagicNumber ||
magic_number == kLegacyPlainTableMagicNumber;
}
inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
if (magic_number == kLegacyBlockBasedTableMagicNumber) {
return kBlockBasedTableMagicNumber;
}
if (magic_number == kLegacyPlainTableMagicNumber) {
return kPlainTableMagicNumber;
}
assert(false);
return magic_number;
}
inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
if (magic_number == kBlockBasedTableMagicNumber) {
return kLegacyBlockBasedTableMagicNumber;
}
if (magic_number == kPlainTableMagicNumber) {
return kLegacyPlainTableMagicNumber;
}
assert(false);
return magic_number;
}
inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) {
Improve / clean up meta block code & integrity (#9163) Summary: * Checksums are now checked on meta blocks unless specifically suppressed or not applicable (e.g. plain table). (Was other way around.) This means a number of cases that were not checking checksums now are, including direct read TableProperties in Version::GetTableProperties (fixed in meta_blocks ReadTableProperties), reading any block from PersistentCache (fixed in BlockFetcher), read TableProperties in SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open, maybe more. * For that to work, I moved the global_seqno+TableProperties checksum logic to the shared table/ code, because that is used by many utilies such as SstFileDumper. * Also for that to work, we have to know when we're dealing with a block that has a checksum (trailer), so added that capability to Footer based on magic number, and from there BlockFetcher. * Knowledge of trailer presence has also fixed a problem where other table formats were reading blocks including bytes for a non-existant trailer--and awkwardly kind-of not using them, e.g. no shared code checking checksums. (BlockFetcher compression type was populated incorrectly.) Now we only read what is needed. * Minimized code duplication and differing/incompatible/awkward abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block without parsing block handle) * Moved some meta block handling code from table_properties*.* * Moved some code specific to block-based table from shared table/ code to BlockBasedTable class. The checksum stuff means we can't completely separate it, but things that don't need to be in shared table/ code should not be. * Use unique_ptr rather than raw ptr in more places. (Note: you can std::move from unique_ptr to shared_ptr.) Without enhancements to GetPropertiesOfAllTablesTest (see below), net reduction of roughly 100 lines of code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163 Test Plan: existing tests and * Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that checksums are now checked on direct read of table properties by TableCache (new test would fail before this change) * Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test putting table properties under old meta name * Also generally enhanced that same test to actually test what it was supposed to be testing already, by kicking things out of table cache when we don't want them there. Reviewed By: ajkr, mrambacher Differential Revision: D32514757 Pulled By: pdillinger fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
if (magic_number == kBlockBasedTableMagicNumber ||
magic_number == kLegacyBlockBasedTableMagicNumber) {
return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize);
Improve / clean up meta block code & integrity (#9163) Summary: * Checksums are now checked on meta blocks unless specifically suppressed or not applicable (e.g. plain table). (Was other way around.) This means a number of cases that were not checking checksums now are, including direct read TableProperties in Version::GetTableProperties (fixed in meta_blocks ReadTableProperties), reading any block from PersistentCache (fixed in BlockFetcher), read TableProperties in SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open, maybe more. * For that to work, I moved the global_seqno+TableProperties checksum logic to the shared table/ code, because that is used by many utilies such as SstFileDumper. * Also for that to work, we have to know when we're dealing with a block that has a checksum (trailer), so added that capability to Footer based on magic number, and from there BlockFetcher. * Knowledge of trailer presence has also fixed a problem where other table formats were reading blocks including bytes for a non-existant trailer--and awkwardly kind-of not using them, e.g. no shared code checking checksums. (BlockFetcher compression type was populated incorrectly.) Now we only read what is needed. * Minimized code duplication and differing/incompatible/awkward abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block without parsing block handle) * Moved some meta block handling code from table_properties*.* * Moved some code specific to block-based table from shared table/ code to BlockBasedTable class. The checksum stuff means we can't completely separate it, but things that don't need to be in shared table/ code should not be. * Use unique_ptr rather than raw ptr in more places. (Note: you can std::move from unique_ptr to shared_ptr.) Without enhancements to GetPropertiesOfAllTablesTest (see below), net reduction of roughly 100 lines of code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163 Test Plan: existing tests and * Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that checksums are now checked on direct read of table properties by TableCache (new test would fail before this change) * Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test putting table properties under old meta name * Also generally enhanced that same test to actually test what it was supposed to be testing already, by kicking things out of table cache when we don't want them there. Reviewed By: ajkr, mrambacher Differential Revision: D32514757 Pulled By: pdillinger fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
} else {
return 0;
Improve / clean up meta block code & integrity (#9163) Summary: * Checksums are now checked on meta blocks unless specifically suppressed or not applicable (e.g. plain table). (Was other way around.) This means a number of cases that were not checking checksums now are, including direct read TableProperties in Version::GetTableProperties (fixed in meta_blocks ReadTableProperties), reading any block from PersistentCache (fixed in BlockFetcher), read TableProperties in SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open, maybe more. * For that to work, I moved the global_seqno+TableProperties checksum logic to the shared table/ code, because that is used by many utilies such as SstFileDumper. * Also for that to work, we have to know when we're dealing with a block that has a checksum (trailer), so added that capability to Footer based on magic number, and from there BlockFetcher. * Knowledge of trailer presence has also fixed a problem where other table formats were reading blocks including bytes for a non-existant trailer--and awkwardly kind-of not using them, e.g. no shared code checking checksums. (BlockFetcher compression type was populated incorrectly.) Now we only read what is needed. * Minimized code duplication and differing/incompatible/awkward abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block without parsing block handle) * Moved some meta block handling code from table_properties*.* * Moved some code specific to block-based table from shared table/ code to BlockBasedTable class. The checksum stuff means we can't completely separate it, but things that don't need to be in shared table/ code should not be. * Use unique_ptr rather than raw ptr in more places. (Note: you can std::move from unique_ptr to shared_ptr.) Without enhancements to GetPropertiesOfAllTablesTest (see below), net reduction of roughly 100 lines of code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163 Test Plan: existing tests and * Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that checksums are now checked on direct read of table properties by TableCache (new test would fail before this change) * Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test putting table properties under old meta name * Also generally enhanced that same test to actually test what it was supposed to be testing already, by kicking things out of table cache when we don't want them there. Reviewed By: ajkr, mrambacher Differential Revision: D32514757 Pulled By: pdillinger fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
}
}
// Footer format, in three parts:
// * Part1
// -> format_version == 0 (inferred from legacy magic number)
// <empty> (0 bytes)
// -> format_version >= 1
// checksum type (char, 1 byte)
// * Part2
// metaindex handle (varint64 offset, varint64 size)
// index handle (varint64 offset, varint64 size)
// <zero padding> for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40
// * Part3
// -> format_version == 0 (inferred from legacy magic number)
// legacy magic number (8 bytes)
// -> format_version >= 1 (inferred from NOT legacy magic number)
// format_version (uint32LE, 4 bytes), also called "footer version"
// newer magic number (8 bytes)
constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength;
} // namespace
void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
uint64_t footer_offset, ChecksumType checksum_type,
const BlockHandle& metaindex_handle,
const BlockHandle& index_handle) {
(void)footer_offset; // Future use
assert(magic_number != Footer::kNullTableMagicNumber);
assert(IsSupportedFormatVersion(format_version));
char* part2;
char* part3;
if (format_version > 0) {
slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength);
// Generate parts 1 and 3
char* cur = data_.data();
// Part 1
*(cur++) = checksum_type;
// Part 2
part2 = cur;
// Skip over part 2 for now
cur += kFooterPart2Size;
// Part 3
part3 = cur;
EncodeFixed32(cur, format_version);
cur += 4;
EncodeFixed64(cur, magic_number);
assert(cur + 8 == slice_.data() + slice_.size());
} else {
slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength);
// Legacy SST files use kCRC32c checksum but it's not stored in footer.
assert(checksum_type == kNoChecksum || checksum_type == kCRC32c);
// Generate part 3 (part 1 empty, skip part 2 for now)
part2 = data_.data();
part3 = part2 + kFooterPart2Size;
char* cur = part3;
// Use legacy magic numbers to indicate format_version=0, for
// compatibility. No other cases should use format_version=0.
EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number));
assert(cur + 8 == slice_.data() + slice_.size());
}
{
char* cur = part2;
cur = metaindex_handle.EncodeTo(cur);
cur = index_handle.EncodeTo(cur);
// Zero pad remainder
std::fill(cur, part3, char{0});
}
}
Status Footer::DecodeFrom(Slice input, uint64_t input_offset) {
(void)input_offset; // Future use
// Only decode to unused Footer
assert(table_magic_number_ == kNullTableMagicNumber);
assert(input != nullptr);
assert(input.size() >= kMinEncodedLength);
const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte;
uint64_t magic = DecodeFixed64(magic_ptr);
// We check for legacy formats here and silently upconvert them
bool legacy = IsLegacyFooterFormat(magic);
if (legacy) {
magic = UpconvertLegacyFooterFormat(magic);
}
table_magic_number_ = magic;
block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic);
// Parse Part3
if (legacy) {
// The size is already asserted to be at least kMinEncodedLength
// at the beginning of the function
input.remove_prefix(input.size() - kVersion0EncodedLength);
format_version_ = 0 /* legacy */;
checksum_type_ = kCRC32c;
} else {
const char* part3_ptr = magic_ptr - 4;
format_version_ = DecodeFixed32(part3_ptr);
if (!IsSupportedFormatVersion(format_version_)) {
return Status::Corruption("Corrupt or unsupported format_version: " +
std::to_string(format_version_));
}
// All known format versions >= 1 occupy exactly this many bytes.
if (input.size() < kNewVersionsEncodedLength) {
return Status::Corruption("Input is too short to be an SST file");
}
uint64_t adjustment = input.size() - kNewVersionsEncodedLength;
input.remove_prefix(adjustment);
// Parse Part1
char chksum = input.data()[0];
checksum_type_ = lossless_cast<ChecksumType>(chksum);
if (!IsSupportedChecksumType(checksum_type())) {
return Status::Corruption("Corrupt or unsupported checksum type: " +
std::to_string(lossless_cast<uint8_t>(chksum)));
}
// Consume checksum type field
input.remove_prefix(1);
}
// Parse Part2
Status result = metaindex_handle_.DecodeFrom(&input);
if (result.ok()) {
result = index_handle_.DecodeFrom(&input);
}
return result;
// Padding in part2 is ignored
}
std::string Footer::ToString() const {
std::string result;
result.reserve(1024);
bool legacy = IsLegacyFooterFormat(table_magic_number_);
if (legacy) {
result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
result.append("index handle: " + index_handle_.ToString() + "\n ");
result.append("table_magic_number: " + std::to_string(table_magic_number_) +
"\n ");
} else {
result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n ");
result.append("index handle: " + index_handle_.ToString() + "\n ");
result.append("table_magic_number: " + std::to_string(table_magic_number_) +
"\n ");
result.append("format version: " + std::to_string(format_version_) +
"\n ");
}
return result;
}
Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
FilePrefetchBuffer* prefetch_buffer,
uint64_t file_size, Footer* footer,
uint64_t enforce_table_magic_number) {
if (file_size < Footer::kMinEncodedLength) {
return Status::Corruption("file is too short (" +
std::to_string(file_size) +
" bytes) to be an "
"sstable: " +
file->file_name());
}
std::string footer_buf;
AlignedBuf internal_buf;
Slice footer_input;
uint64_t read_offset = (file_size > Footer::kMaxEncodedLength)
? file_size - Footer::kMaxEncodedLength
: 0;
Status s;
// TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now,
// there is no readahead for point lookups, so TryReadFromCache will fail if
// the required data is not in the prefetch buffer. Once deadline is enabled
// for iterator, TryReadFromCache might do a readahead. Revisit to see if we
// need to pass a timeout at that point
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
2022-02-17 07:17:03 +00:00
// TODO: rate limit footer reads.
if (prefetch_buffer == nullptr ||
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
2022-02-17 07:17:03 +00:00
!prefetch_buffer->TryReadFromCache(
Set Read rate limiter priority dynamically and pass it to FS (#9996) Summary: ### Context: Background compactions and flush generate large reads and writes, and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users. ### Solution User, Flush, and Compaction reads share some code path. For this task, we update the rate_limiter_priority in ReadOptions for code paths (e.g. FindTable (mainly in BlockBasedTable::Open()) and various iterators), and eventually update the rate_limiter_priority in IOOptions for FSRandomAccessFile. **This PR is for the Read path.** The **Read:** dynamic priority for different state are listed as follows: | State | Normal | Delayed | Stalled | | ----- | ------ | ------- | ------- | | Flush (verification read in BuildTable()) | IO_USER | IO_USER | IO_USER | | Compaction | IO_LOW | IO_USER | IO_USER | | User | User provided | User provided | User provided | We will respect the read_options that the user provided and will not set it. The only sst read for Flush is the verification read in BuildTable(). It claims to be "regard as user read". **Details** 1. Set read_options.rate_limiter_priority dynamically: - User: Do not update the read_options. Use the read_options that the user provided. - Compaction: Update read_options in CompactionJob::ProcessKeyValueCompaction(). - Flush: Update read_options in BuildTable(). 2. Pass the rate limiter priority to FSRandomAccessFile functions: - After calling the FindTable(), read_options is passed through GetTableReader(table_cache.cc), BlockBasedTableFactory::NewTableReader(block_based_table_factory.cc), and BlockBasedTable::Open(). The Open() needs some updates for the ReadOptions variable and the updates are also needed for the called functions, including PrefetchTail(), PrepareIOOptions(), ReadFooterFromFile(), ReadMetaIndexblock(), ReadPropertiesBlock(), PrefetchIndexAndFilterBlocks(), and ReadRangeDelBlock(). - In RandomAccessFileReader, the functions to be updated include Read(), MultiRead(), ReadAsync(), and Prefetch(). - Update the downstream functions of NewIndexIterator(), NewDataBlockIterator(), and BlockBasedTableIterator(). ### Test Plans Add unit tests. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9996 Reviewed By: anand1976 Differential Revision: D36452483 Pulled By: gitbw95 fbshipit-source-id: 60978204a4f849bb9261cb78d9bc1cb56d6008cf
2022-05-19 02:41:44 +00:00
opts, file, read_offset, Footer::kMaxEncodedLength, &footer_input,
nullptr, opts.rate_limiter_priority)) {
if (file->use_direct_io()) {
s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
2022-02-17 07:17:03 +00:00
&footer_input, nullptr, &internal_buf,
Set Read rate limiter priority dynamically and pass it to FS (#9996) Summary: ### Context: Background compactions and flush generate large reads and writes, and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users. ### Solution User, Flush, and Compaction reads share some code path. For this task, we update the rate_limiter_priority in ReadOptions for code paths (e.g. FindTable (mainly in BlockBasedTable::Open()) and various iterators), and eventually update the rate_limiter_priority in IOOptions for FSRandomAccessFile. **This PR is for the Read path.** The **Read:** dynamic priority for different state are listed as follows: | State | Normal | Delayed | Stalled | | ----- | ------ | ------- | ------- | | Flush (verification read in BuildTable()) | IO_USER | IO_USER | IO_USER | | Compaction | IO_LOW | IO_USER | IO_USER | | User | User provided | User provided | User provided | We will respect the read_options that the user provided and will not set it. The only sst read for Flush is the verification read in BuildTable(). It claims to be "regard as user read". **Details** 1. Set read_options.rate_limiter_priority dynamically: - User: Do not update the read_options. Use the read_options that the user provided. - Compaction: Update read_options in CompactionJob::ProcessKeyValueCompaction(). - Flush: Update read_options in BuildTable(). 2. Pass the rate limiter priority to FSRandomAccessFile functions: - After calling the FindTable(), read_options is passed through GetTableReader(table_cache.cc), BlockBasedTableFactory::NewTableReader(block_based_table_factory.cc), and BlockBasedTable::Open(). The Open() needs some updates for the ReadOptions variable and the updates are also needed for the called functions, including PrefetchTail(), PrepareIOOptions(), ReadFooterFromFile(), ReadMetaIndexblock(), ReadPropertiesBlock(), PrefetchIndexAndFilterBlocks(), and ReadRangeDelBlock(). - In RandomAccessFileReader, the functions to be updated include Read(), MultiRead(), ReadAsync(), and Prefetch(). - Update the downstream functions of NewIndexIterator(), NewDataBlockIterator(), and BlockBasedTableIterator(). ### Test Plans Add unit tests. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9996 Reviewed By: anand1976 Differential Revision: D36452483 Pulled By: gitbw95 fbshipit-source-id: 60978204a4f849bb9261cb78d9bc1cb56d6008cf
2022-05-19 02:41:44 +00:00
opts.rate_limiter_priority);
} else {
footer_buf.reserve(Footer::kMaxEncodedLength);
s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
2022-02-17 07:17:03 +00:00
&footer_input, &footer_buf[0], nullptr,
Set Read rate limiter priority dynamically and pass it to FS (#9996) Summary: ### Context: Background compactions and flush generate large reads and writes, and can be long running, especially for universal compaction. In some cases, this can impact foreground reads and writes by users. ### Solution User, Flush, and Compaction reads share some code path. For this task, we update the rate_limiter_priority in ReadOptions for code paths (e.g. FindTable (mainly in BlockBasedTable::Open()) and various iterators), and eventually update the rate_limiter_priority in IOOptions for FSRandomAccessFile. **This PR is for the Read path.** The **Read:** dynamic priority for different state are listed as follows: | State | Normal | Delayed | Stalled | | ----- | ------ | ------- | ------- | | Flush (verification read in BuildTable()) | IO_USER | IO_USER | IO_USER | | Compaction | IO_LOW | IO_USER | IO_USER | | User | User provided | User provided | User provided | We will respect the read_options that the user provided and will not set it. The only sst read for Flush is the verification read in BuildTable(). It claims to be "regard as user read". **Details** 1. Set read_options.rate_limiter_priority dynamically: - User: Do not update the read_options. Use the read_options that the user provided. - Compaction: Update read_options in CompactionJob::ProcessKeyValueCompaction(). - Flush: Update read_options in BuildTable(). 2. Pass the rate limiter priority to FSRandomAccessFile functions: - After calling the FindTable(), read_options is passed through GetTableReader(table_cache.cc), BlockBasedTableFactory::NewTableReader(block_based_table_factory.cc), and BlockBasedTable::Open(). The Open() needs some updates for the ReadOptions variable and the updates are also needed for the called functions, including PrefetchTail(), PrepareIOOptions(), ReadFooterFromFile(), ReadMetaIndexblock(), ReadPropertiesBlock(), PrefetchIndexAndFilterBlocks(), and ReadRangeDelBlock(). - In RandomAccessFileReader, the functions to be updated include Read(), MultiRead(), ReadAsync(), and Prefetch(). - Update the downstream functions of NewIndexIterator(), NewDataBlockIterator(), and BlockBasedTableIterator(). ### Test Plans Add unit tests. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9996 Reviewed By: anand1976 Differential Revision: D36452483 Pulled By: gitbw95 fbshipit-source-id: 60978204a4f849bb9261cb78d9bc1cb56d6008cf
2022-05-19 02:41:44 +00:00
opts.rate_limiter_priority);
}
if (!s.ok()) return s;
}
// Check that we actually read the whole footer from the file. It may be
// that size isn't correct.
if (footer_input.size() < Footer::kMinEncodedLength) {
Always verify SST unique IDs on SST file open (#10532) Summary: Although we've been tracking SST unique IDs in the DB manifest unconditionally, checking has been opt-in and with an extra pass at DB::Open time. This changes the behavior of `verify_sst_unique_id_in_manifest` to check unique ID against manifest every time an SST file is opened through table cache (normal DB operations), replacing the explicit pass over files at DB::Open time. This change also enables the option by default and removes the "EXPERIMENTAL" designation. One possible criticism is that the option no longer ensures the integrity of a DB at Open time. This is far from an all-or-nothing issue. Verifying the IDs of all SST files hardly ensures all the data in the DB is readable. (VerifyChecksum is supposed to do that.) Also, with max_open_files=-1 (default, extremely common), all SST files are opened at DB::Open time anyway. Implementation details: * `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass that is now removed. * Unit tests that manipulate/corrupt table properties have to opt out of this check, because that corrupts the "actual" unique id. (And even for testing we don't currently have a mechanism to set "no unique id" in the in-memory file metadata for new files.) * A lot of other unit test churn relates to (a) default checking on, and (b) checking on SST open even without DB::Open (e.g. on flush) * Use `FileMetaData` for more `TableCache` operations (in place of `FileDescriptor`) so that we have access to the unique_id whenever we might need to open an SST file. **There is the possibility of performance impact because we can no longer use the more localized `fd` part of an `FdWithKeyRange` but instead follow the `file_metadata` pointer. However, this change (possible regression) is only done for `GetMemoryUsageByTableReaders`.** * Removed a completely unnecessary constructor overload of `TableReaderOptions` Possible follow-up: * Verification only happens when opening through table cache. Are there more places where this should happen? * Improve error message when there is a file size mismatch vs. manifest (FIXME added in the appropriate place). * I'm not sure there's a justification for `FileDescriptor` to be distinct from `FileMetaData`. * I'm skeptical that `FdWithKeyRange` really still makes sense for optimizing some data locality by duplicating some data in memory, but I could be wrong. * An unnecessary overload of NewTableReader was recently added, in the public API nonetheless (though unusable there). It should be cleaned up to put most things under `TableReaderOptions`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532 Test Plan: updated unit tests Performance test showing no significant difference (just noise I think): `./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000` Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec Reviewed By: jay-zhuang Differential Revision: D38765551 Pulled By: pdillinger fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
// FIXME: this error message is bad. We should be checking whether the
// provided file_size matches what's on disk, at least in this case.
// Unfortunately FileSystem/Env does not provide a way to get the size
// of an open file, so getting file size requires a full path seek.
return Status::Corruption("file is too short (" +
std::to_string(file_size) +
" bytes) to be an "
"sstable" +
file->file_name());
}
s = footer->DecodeFrom(footer_input, read_offset);
if (!s.ok()) {
return s;
}
if (enforce_table_magic_number != 0 &&
enforce_table_magic_number != footer->table_magic_number()) {
return Status::Corruption("Bad table magic number: expected " +
std::to_string(enforce_table_magic_number) +
", found " +
std::to_string(footer->table_magic_number()) +
" in " + file->file_name());
}
return Status::OK();
}
namespace {
// Custom handling for the last byte of a block, to avoid invoking streaming
// API to get an effective block checksum. This function is its own inverse
// because it uses xor.
inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) {
// This strategy bears some resemblance to extending a CRC checksum by one
// more byte, except we don't need to re-mix the input checksum as long as
// we do this step only once (per checksum).
const uint32_t kRandomPrime = 0x6b9083d9;
return checksum ^ lossless_cast<uint8_t>(last_byte) * kRandomPrime;
}
} // namespace
uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
size_t data_size) {
switch (type) {
case kCRC32c:
return crc32c::Mask(crc32c::Value(data, data_size));
case kxxHash:
return XXH32(data, data_size, /*seed*/ 0);
case kxxHash64:
return Lower32of64(XXH64(data, data_size, /*seed*/ 0));
case kXXH3: {
if (data_size == 0) {
// Special case because of special handling for last byte, not
// present in this case. Can be any value different from other
// small input size checksums.
return 0;
} else {
// See corresponding code in ComputeBuiltinChecksumWithLastByte
uint32_t v = Lower32of64(XXH3_64bits(data, data_size - 1));
return ModifyChecksumForLastByte(v, data[data_size - 1]);
}
}
default: // including kNoChecksum
return 0;
}
}
uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
size_t data_size, char last_byte) {
switch (type) {
case kCRC32c: {
uint32_t crc = crc32c::Value(data, data_size);
// Extend to cover last byte (compression type)
crc = crc32c::Extend(crc, &last_byte, 1);
return crc32c::Mask(crc);
}
case kxxHash: {
XXH32_state_t* const state = XXH32_createState();
XXH32_reset(state, 0);
XXH32_update(state, data, data_size);
// Extend to cover last byte (compression type)
XXH32_update(state, &last_byte, 1);
uint32_t v = XXH32_digest(state);
XXH32_freeState(state);
return v;
}
case kxxHash64: {
XXH64_state_t* const state = XXH64_createState();
XXH64_reset(state, 0);
XXH64_update(state, data, data_size);
// Extend to cover last byte (compression type)
XXH64_update(state, &last_byte, 1);
uint32_t v = Lower32of64(XXH64_digest(state));
XXH64_freeState(state);
return v;
}
case kXXH3: {
// XXH3 is a complicated hash function that is extremely fast on
// contiguous input, but that makes its streaming support rather
// complex. It is worth custom handling of the last byte (`type`)
// in order to avoid allocating a large state object and bringing
// that code complexity into CPU working set.
uint32_t v = Lower32of64(XXH3_64bits(data, data_size));
return ModifyChecksumForLastByte(v, last_byte);
}
default: // including kNoChecksum
return 0;
}
}
Status UncompressBlockContentsForCompressionType(
const UncompressionInfo& uncompression_info, const char* data, size_t n,
BlockContents* contents, uint32_t format_version,
const ImmutableOptions& ioptions, MemoryAllocator* allocator) {
Status ret = Status::OK();
assert(uncompression_info.type() != kNoCompression &&
"Invalid compression type");
StopWatchNano timer(ioptions.clock,
ShouldReportDetailedTime(ioptions.env, ioptions.stats));
size_t uncompressed_size = 0;
CacheAllocationPtr ubuf =
UncompressData(uncompression_info, data, n, &uncompressed_size,
GetCompressFormatForVersion(format_version), allocator);
if (!ubuf) {
if (!CompressionTypeSupported(uncompression_info.type())) {
return Status::NotSupported(
"Unsupported compression method for this build",
CompressionTypeToString(uncompression_info.type()));
} else {
return Status::Corruption(
"Corrupted compressed block contents",
CompressionTypeToString(uncompression_info.type()));
}
}
*contents = BlockContents(std::move(ubuf), uncompressed_size);
if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) {
RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS,
timer.ElapsedNanos());
}
RecordTimeToHistogram(ioptions.stats, BYTES_DECOMPRESSED,
contents->data.size());
RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED);
TEST_SYNC_POINT_CALLBACK(
"UncompressBlockContentsForCompressionType:TamperWithReturnValue",
static_cast<void*>(&ret));
TEST_SYNC_POINT_CALLBACK(
"UncompressBlockContentsForCompressionType:"
"TamperWithDecompressionOutput",
static_cast<void*>(contents));
return ret;
}
//
// The 'data' points to the raw block contents that was read in from file.
// This method allocates a new heap buffer and the raw block
// contents are uncompresed into this buffer. This
// buffer is returned via 'result' and it is upto the caller to
// free this buffer.
// format_version is the block format as defined in include/rocksdb/table.h
Status UncompressBlockContents(const UncompressionInfo& uncompression_info,
const char* data, size_t n,
BlockContents* contents, uint32_t format_version,
const ImmutableOptions& ioptions,
MemoryAllocator* allocator) {
assert(data[n] != kNoCompression);
assert(data[n] == static_cast<char>(uncompression_info.type()));
return UncompressBlockContentsForCompressionType(uncompression_info, data, n,
contents, format_version,
ioptions, allocator);
}
// Replace the contents of db_host_id with the actual hostname, if db_host_id
// matches the keyword kHostnameForDbHostId
Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) {
assert(db_host_id);
if (*db_host_id == kHostnameForDbHostId) {
Status s = env->GetHostNameString(db_host_id);
if (!s.ok()) {
db_host_id->clear();
}
return s;
}
return Status::OK();
}
} // namespace ROCKSDB_NAMESPACE