2016-04-11 18:39:51 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2016-04-11 18:39:51 +00:00
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include <cstring>
|
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
#include "options/cf_options.h"
|
|
|
|
#include "options/db_options.h"
|
2017-11-02 00:23:52 +00:00
|
|
|
#include "options/options_helper.h"
|
2016-04-11 18:39:51 +00:00
|
|
|
#include "rocksdb/convenience.h"
|
2019-05-30 18:21:38 +00:00
|
|
|
#include "test_util/testharness.h"
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
#ifndef GFLAGS
|
|
|
|
// Fallback definition for builds without gflags; the #else branch defines the
// same flag through DEFINE_bool instead.
bool FLAGS_enable_print = false;
|
|
|
|
#else
|
2017-12-01 18:40:45 +00:00
|
|
|
#include "util/gflags_compat.h"
|
|
|
|
using GFLAGS_NAMESPACE::ParseCommandLineFlags;
|
2016-04-11 18:39:51 +00:00
|
|
|
DEFINE_bool(enable_print, false, "Print options generated to console.");
|
|
|
|
#endif // GFLAGS
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
// Verify options are settable from options strings.
|
|
|
|
// We take the approach that depends on compiler behavior that copy constructor
|
|
|
|
// won't touch implicit padding bytes, which makes the test fragile.
|
|
|
|
// As a result, we only run the tests to verify new fields in options are
|
|
|
|
// settable through string on limited platforms as it depends on behavior of
|
|
|
|
// compilers.
|
2017-04-27 19:19:55 +00:00
|
|
|
#if defined OS_LINUX || defined OS_WIN
|
2016-04-11 18:39:51 +00:00
|
|
|
#ifndef __clang__
|
2022-04-19 03:26:37 +00:00
|
|
|
#ifndef ROCKSDB_UBSAN_RUN
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
// Test fixture for the "all fields settable" option tests in this file.
// It declares no members of its own; it only gives the TEST_F cases a common
// fixture name.
class OptionsSettableTest : public testing::Test {
 public:
  OptionsSettableTest() = default;
};
|
|
|
|
|
|
|
|
// Sentinel byte: FillWithSpecialChar() writes it by default and
// NumUnsetBytes() counts the bytes that still hold it.
const char kSpecialChar = 'z';
|
2021-09-07 18:31:12 +00:00
|
|
|
// A list of <offset, size> byte ranges within a struct. The helpers in this
// file skip these ranges when filling, counting or comparing bytes.
using OffsetGap = std::vector<std::pair<size_t, size_t>>;
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
void FillWithSpecialChar(char* start_ptr, size_t total_size,
|
2020-06-19 22:26:05 +00:00
|
|
|
const OffsetGap& excluded,
|
2020-04-08 21:37:01 +00:00
|
|
|
char special_char = kSpecialChar) {
|
2016-04-11 18:39:51 +00:00
|
|
|
size_t offset = 0;
|
2022-02-04 13:31:07 +00:00
|
|
|
// The excluded vector contains pairs of bytes, (first, second).
|
|
|
|
// The first bytes are all set to the special char (represented as 'c' below).
|
|
|
|
// The second bytes are simply skipped (padding bytes).
|
|
|
|
// ccccc[skipped]cccccccc[skiped]cccccccc[skipped]
|
2020-06-19 22:26:05 +00:00
|
|
|
for (auto& pair : excluded) {
|
2020-04-08 21:37:01 +00:00
|
|
|
std::memset(start_ptr + offset, special_char, pair.first - offset);
|
2016-04-11 18:39:51 +00:00
|
|
|
offset = pair.first + pair.second;
|
|
|
|
}
|
2022-02-04 13:31:07 +00:00
|
|
|
// The rest of the structure is filled with the special characters.
|
|
|
|
// ccccc[skipped]cccccccc[skiped]cccccccc[skipped]cccccccccccccccc
|
2020-04-08 21:37:01 +00:00
|
|
|
std::memset(start_ptr + offset, special_char, total_size - offset);
|
2016-04-11 18:39:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int NumUnsetBytes(char* start_ptr, size_t total_size,
|
2020-06-19 22:26:05 +00:00
|
|
|
const OffsetGap& excluded) {
|
2016-04-11 18:39:51 +00:00
|
|
|
int total_unset_bytes_base = 0;
|
|
|
|
size_t offset = 0;
|
2020-06-19 22:26:05 +00:00
|
|
|
for (auto& pair : excluded) {
|
2022-02-04 13:31:07 +00:00
|
|
|
// The first part of the structure contains memory spaces that can be
|
|
|
|
// set (pair.first), and memory spaces that cannot be set (pair.second).
|
|
|
|
// Therefore total_unset_bytes_base only agregates bytes set to kSpecialChar
|
|
|
|
// in the pair.first bytes, but skips the pair.second bytes (padding bytes).
|
2016-04-11 18:39:51 +00:00
|
|
|
for (char* ptr = start_ptr + offset; ptr < start_ptr + pair.first; ptr++) {
|
|
|
|
if (*ptr == kSpecialChar) {
|
|
|
|
total_unset_bytes_base++;
|
|
|
|
}
|
|
|
|
}
|
2016-04-11 21:57:27 +00:00
|
|
|
offset = pair.first + pair.second;
|
2016-04-11 18:39:51 +00:00
|
|
|
}
|
2022-02-04 13:31:07 +00:00
|
|
|
// Then total_unset_bytes_base aggregates the bytes
|
|
|
|
// set to kSpecialChar in the rest of the structure
|
2016-04-11 18:39:51 +00:00
|
|
|
for (char* ptr = start_ptr + offset; ptr < start_ptr + total_size; ptr++) {
|
|
|
|
if (*ptr == kSpecialChar) {
|
|
|
|
total_unset_bytes_base++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return total_unset_bytes_base;
|
|
|
|
}
|
|
|
|
|
2020-06-19 22:26:05 +00:00
|
|
|
// Return true iff two structs are the same except excluded fields.
|
2020-04-08 21:37:01 +00:00
|
|
|
bool CompareBytes(char* start_ptr1, char* start_ptr2, size_t total_size,
|
2020-06-19 22:26:05 +00:00
|
|
|
const OffsetGap& excluded) {
|
2020-04-08 21:37:01 +00:00
|
|
|
size_t offset = 0;
|
2020-06-19 22:26:05 +00:00
|
|
|
for (auto& pair : excluded) {
|
2020-04-08 21:37:01 +00:00
|
|
|
for (; offset < pair.first; offset++) {
|
|
|
|
if (*(start_ptr1 + offset) != *(start_ptr2 + offset)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
offset = pair.first + pair.second;
|
|
|
|
}
|
|
|
|
for (; offset < total_size; offset++) {
|
|
|
|
if (*(start_ptr1 + offset) != *(start_ptr2 + offset)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-04-11 18:39:51 +00:00
|
|
|
// If the test fails, likely a new option is added to BlockBasedTableOptions
|
|
|
|
// but it cannot be set through GetBlockBasedTableOptionsFromString(), or the
|
|
|
|
// test is not updated accordingly.
|
|
|
|
// After adding an option, we need to make sure it is settable by
|
|
|
|
// GetBlockBasedTableOptionsFromString() and add the option to the input string
|
|
|
|
// passed to the GetBlockBasedTableOptionsFromString() in this test.
|
|
|
|
// If it is a complicated type, you also need to add the field to
|
2020-06-19 22:26:05 +00:00
|
|
|
// kBbtoExcluded, and maybe add customized verification for it.
|
2016-04-11 18:39:51 +00:00
|
|
|
TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
  // Items in the form of <offset, size>. Need to be in ascending order
  // and not overlapping. Need to update if a new option to be excluded is
  // added (e.g., pointer-type).
  const OffsetGap kBbtoExcluded = {
      {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory),
       sizeof(std::shared_ptr<FlushBlockPolicyFactory>)},
      {offsetof(struct BlockBasedTableOptions, block_cache),
       sizeof(std::shared_ptr<Cache>)},
      {offsetof(struct BlockBasedTableOptions, persistent_cache),
       sizeof(std::shared_ptr<PersistentCache>)},
      {offsetof(struct BlockBasedTableOptions, cache_usage_options),
       sizeof(CacheUsageOptions)},
      {offsetof(struct BlockBasedTableOptions, filter_policy),
       sizeof(std::shared_ptr<const FilterPolicy>)},
  };

  // In this test, we catch a new option of BlockBasedTableOptions that is not
  // settable through GetBlockBasedTableOptionsFromString().
  // We count padding bytes of the option struct, and assert it to be the same
  // as unset bytes of an option struct initialized by
  // GetBlockBasedTableOptionsFromString().

  // The raw storage is owned by unique_ptr so that it is released even when
  // one of the fatal ASSERT_* macros below returns from the test early.
  // NOTE: the in-place destructor calls further down still run only on the
  // success path.
  std::unique_ptr<char[]> bbto_storage(
      new char[sizeof(BlockBasedTableOptions)]);
  char* bbto_ptr = bbto_storage.get();

  // Count padding bytes by setting all bytes in the memory to a special char,
  // copy a well constructed struct to this memory and see how many special
  // bytes are left.
  BlockBasedTableOptions* bbto = new (bbto_ptr) BlockBasedTableOptions();
  FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded);
  // This relies on the compiler leaving padding bytes unchanged when copying
  // the struct. It is prone to failure when compiler behavior changes. We
  // verify that there are unset bytes in order to detect that case.
  *bbto = BlockBasedTableOptions();
  int unset_bytes_base =
      NumUnsetBytes(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded);
  ASSERT_GT(unset_bytes_base, 0);
  bbto->~BlockBasedTableOptions();

  // Construct the base option passed into
  // GetBlockBasedTableOptionsFromString().
  bbto = new (bbto_ptr) BlockBasedTableOptions();
  FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoExcluded);

  std::unique_ptr<char[]> new_bbto_storage(
      new char[sizeof(BlockBasedTableOptions)]);
  char* new_bbto_ptr = new_bbto_storage.get();
  BlockBasedTableOptions* new_bbto =
      new (new_bbto_ptr) BlockBasedTableOptions();
  FillWithSpecialChar(new_bbto_ptr, sizeof(BlockBasedTableOptions),
                      kBbtoExcluded);

  // Need to update the option string if a new option is added.
  ConfigOptions config_options;
  config_options.input_strings_escaped = false;
  config_options.ignore_unknown_options = false;
  config_options.invoke_prepare_options = false;
  config_options.ignore_unsupported_options = false;
  ASSERT_OK(GetBlockBasedTableOptionsFromString(
      config_options, *bbto,
      "cache_index_and_filter_blocks=1;"
      "cache_index_and_filter_blocks_with_high_priority=true;"
      "metadata_cache_options={top_level_index_pinning=kFallback;"
      "partition_pinning=kAll;"
      "unpartitioned_pinning=kFlushedAndSimilar;};"
      "pin_l0_filter_and_index_blocks_in_cache=1;"
      "pin_top_level_index_and_filter=1;"
      "index_type=kHashSearch;"
      "data_block_index_type=kDataBlockBinaryAndHash;"
      "index_shortening=kNoShortening;"
      "data_block_hash_table_util_ratio=0.75;"
      "checksum=kxxHash;no_block_cache=1;"
      "block_cache=1M;block_cache_compressed=1k;block_size=1024;"
      "block_size_deviation=8;block_restart_interval=4; "
      "metadata_block_size=1024;"
      "partition_filters=false;"
      "decouple_partitioned_filters=true;"
      "optimize_filters_for_memory=true;"
      "use_delta_encoding=true;"
      "index_block_restart_interval=4;"
      "filter_policy=bloomfilter:4:true;whole_key_filtering=1;detect_filter_"
      "construct_corruption=false;"
      "format_version=1;"
      "verify_compression=true;read_amp_bytes_per_bit=0;"
      "enable_index_compression=false;"
      "block_align=true;"
      "max_auto_readahead_size=0;"
      "prepopulate_block_cache=kDisable;"
      "initial_auto_readahead_size=0;"
      "num_file_reads_for_auto_readahead=0",
      new_bbto));

  // Every non-excluded, non-padding byte must have been overwritten by the
  // parsed options; otherwise some field is not settable from the string.
  ASSERT_EQ(unset_bytes_base,
            NumUnsetBytes(new_bbto_ptr, sizeof(BlockBasedTableOptions),
                          kBbtoExcluded));

  // Customized verification for excluded (pointer-type) fields.
  ASSERT_TRUE(new_bbto->block_cache.get() != nullptr);
  ASSERT_TRUE(new_bbto->filter_policy.get() != nullptr);

  // The structs were created with placement new, so destroy them explicitly;
  // the backing buffers are freed by the unique_ptr owners.
  bbto->~BlockBasedTableOptions();
  new_bbto->~BlockBasedTableOptions();
}
|
|
|
|
|
|
|
|
// If the test fails, likely a new option is added to DBOptions
|
|
|
|
// but it cannot be set through GetDBOptionsFromString(), or the test is not
|
|
|
|
// updated accordingly.
|
|
|
|
// After adding an option, we need to make sure it is settable by
|
|
|
|
// GetDBOptionsFromString() and add the option to the input string passed to
|
|
|
|
// GetDBOptionsFromString() in this test.
|
|
|
|
// If it is a complicated type, you also need to add the field to
|
2020-06-19 22:26:05 +00:00
|
|
|
// kDBOptionsExcluded, and maybe add customized verification for it.
|
2016-04-11 18:39:51 +00:00
|
|
|
TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
|
2020-06-19 22:26:05 +00:00
|
|
|
const OffsetGap kDBOptionsExcluded = {
|
2016-04-11 18:39:51 +00:00
|
|
|
{offsetof(struct DBOptions, env), sizeof(Env*)},
|
|
|
|
{offsetof(struct DBOptions, rate_limiter),
|
|
|
|
sizeof(std::shared_ptr<RateLimiter>)},
|
|
|
|
{offsetof(struct DBOptions, sst_file_manager),
|
|
|
|
sizeof(std::shared_ptr<SstFileManager>)},
|
|
|
|
{offsetof(struct DBOptions, info_log), sizeof(std::shared_ptr<Logger>)},
|
|
|
|
{offsetof(struct DBOptions, statistics),
|
|
|
|
sizeof(std::shared_ptr<Statistics>)},
|
|
|
|
{offsetof(struct DBOptions, db_paths), sizeof(std::vector<DbPath>)},
|
|
|
|
{offsetof(struct DBOptions, db_log_dir), sizeof(std::string)},
|
|
|
|
{offsetof(struct DBOptions, wal_dir), sizeof(std::string)},
|
2016-06-21 01:01:03 +00:00
|
|
|
{offsetof(struct DBOptions, write_buffer_manager),
|
|
|
|
sizeof(std::shared_ptr<WriteBufferManager>)},
|
2016-04-11 18:39:51 +00:00
|
|
|
{offsetof(struct DBOptions, listeners),
|
|
|
|
sizeof(std::vector<std::shared_ptr<EventListener>>)},
|
|
|
|
{offsetof(struct DBOptions, row_cache), sizeof(std::shared_ptr<Cache>)},
|
|
|
|
{offsetof(struct DBOptions, wal_filter), sizeof(const WalFilter*)},
|
2020-03-29 22:57:02 +00:00
|
|
|
{offsetof(struct DBOptions, file_checksum_gen_factory),
|
|
|
|
sizeof(std::shared_ptr<FileChecksumGenFactory>)},
|
2020-10-19 18:37:05 +00:00
|
|
|
{offsetof(struct DBOptions, db_host_id), sizeof(std::string)},
|
2021-02-11 06:18:33 +00:00
|
|
|
{offsetof(struct DBOptions, checksum_handoff_file_types),
|
|
|
|
sizeof(FileTypeSet)},
|
2021-05-20 04:40:43 +00:00
|
|
|
{offsetof(struct DBOptions, compaction_service),
|
|
|
|
sizeof(std::shared_ptr<CompactionService>)},
|
Offpeak in db option (#11893)
Summary:
RocksDB's primary function is to facilitate read and write operations. Compactions, while essential for minimizing read amplifications and optimizing storage, can sometimes compete with these primary tasks. Especially during periods of high read/write traffic, it's vital to ensure that primary operations receive priority, avoiding any potential disruptions or slowdowns. Conversely, during off-peak times when traffic is minimal, it's an opportune moment to tackle low-priority tasks like TTL based compactions, optimizing resource usage.
In this PR, we are incorporating the concept of off-peak time into RocksDB by introducing `daily_offpeak_time_utc` within the DBOptions. This setting is formatted as "HH:mm-HH:mm" where the first one before "-" is the start time and the second one is the end time, inclusive. It will be later used for resource optimization in subsequent PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11893
Test Plan:
- New Unit Test Added - `DBOptionsTest::OffPeakTimes`
- Existing Unit Test Updated - `OptionsTest`, `OptionsSettableTest`
Reviewed By: pdillinger
Differential Revision: D49714553
Pulled By: jaykorean
fbshipit-source-id: fef51ea7c0fede6431c715bff116ddbb567c8752
2023-09-29 20:03:39 +00:00
|
|
|
{offsetof(struct DBOptions, daily_offpeak_time_utc), sizeof(std::string)},
|
2016-04-11 18:39:51 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
char* options_ptr = new char[sizeof(DBOptions)];
|
|
|
|
|
|
|
|
// Count padding bytes by setting all bytes in the memory to a special char,
|
|
|
|
// copy a well constructed struct to this memory and see how many special
|
|
|
|
// bytes left.
|
|
|
|
DBOptions* options = new (options_ptr) DBOptions();
|
2020-06-19 22:26:05 +00:00
|
|
|
FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
|
2016-04-11 18:39:51 +00:00
|
|
|
// It based on the behavior of compiler that padding bytes are not changed
|
|
|
|
// when copying the struct. It's prone to failure when compiler behavior
|
|
|
|
// changes. We verify there is unset bytes to detect the case.
|
|
|
|
*options = DBOptions();
|
|
|
|
int unset_bytes_base =
|
2020-06-19 22:26:05 +00:00
|
|
|
NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
|
2016-04-11 18:39:51 +00:00
|
|
|
ASSERT_GT(unset_bytes_base, 0);
|
|
|
|
options->~DBOptions();
|
|
|
|
|
2024-09-26 21:36:29 +00:00
|
|
|
// Now also check that BuildDBOptions populates everything
|
|
|
|
FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
|
|
|
|
BuildDBOptions({}, {}, *options);
|
|
|
|
ASSERT_EQ(unset_bytes_base,
|
|
|
|
NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsExcluded));
|
|
|
|
|
2016-04-11 18:39:51 +00:00
|
|
|
options = new (options_ptr) DBOptions();
|
2020-06-19 22:26:05 +00:00
|
|
|
FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
char* new_options_ptr = new char[sizeof(DBOptions)];
|
|
|
|
DBOptions* new_options = new (new_options_ptr) DBOptions();
|
2020-06-19 22:26:05 +00:00
|
|
|
FillWithSpecialChar(new_options_ptr, sizeof(DBOptions), kDBOptionsExcluded);
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
// Need to update the option string if a new option is added.
|
2023-02-07 22:11:53 +00:00
|
|
|
ConfigOptions config_options(*options);
|
|
|
|
config_options.input_strings_escaped = false;
|
|
|
|
config_options.ignore_unknown_options = false;
|
2016-04-11 18:39:51 +00:00
|
|
|
ASSERT_OK(
|
2023-02-07 22:11:53 +00:00
|
|
|
GetDBOptionsFromString(config_options, *options,
|
2016-04-11 18:39:51 +00:00
|
|
|
"wal_bytes_per_sync=4295048118;"
|
|
|
|
"delete_obsolete_files_period_micros=4294967758;"
|
|
|
|
"WAL_ttl_seconds=4295008036;"
|
|
|
|
"WAL_size_limit_MB=4295036161;"
|
2019-09-12 01:26:22 +00:00
|
|
|
"max_write_batch_group_size_bytes=1048576;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"wal_dir=path/to/wal_dir;"
|
|
|
|
"db_write_buffer_size=2587;"
|
|
|
|
"max_subcompactions=64330;"
|
|
|
|
"table_cache_numshardbits=28;"
|
|
|
|
"max_open_files=72;"
|
|
|
|
"max_file_opening_threads=35;"
|
2017-05-24 18:25:38 +00:00
|
|
|
"max_background_jobs=8;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"max_background_compactions=33;"
|
|
|
|
"use_fsync=true;"
|
|
|
|
"use_adaptive_mutex=false;"
|
|
|
|
"max_total_wal_size=4295005604;"
|
|
|
|
"compaction_readahead_size=0;"
|
|
|
|
"keep_log_file_num=4890;"
|
|
|
|
"skip_stats_update_on_db_open=false;"
|
Add an option to prevent DB::Open() from querying sizes of all sst files (#6353)
Summary:
When paranoid_checks is on, DBImpl::CheckConsistency() iterates over all sst files and calls Env::GetFileSize() for each of them. As far as I could understand, this is pretty arbitrary and doesn't affect correctness - if filesystem doesn't corrupt fsynced files, the file sizes will always match; if it does, it may as well corrupt contents as well as sizes, and rocksdb doesn't check contents on open.
If there are thousands of sst files, getting all their sizes takes a while. If, on top of that, Env is overridden to use some remote storage instead of local filesystem, it can be *really* slow and overload the remote storage service. This PR adds an option to not do GetFileSize(); instead it does GetChildren() for parent directory to check that all the expected sst files are at least present, but doesn't check their sizes.
We can't just disable paranoid_checks instead because paranoid_checks do a few other important things: make the DB read-only on write errors, print error messages on read errors, etc.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6353
Test Plan: ran the added sanity check unit test. Will try it out in a LogDevice test cluster where the GetFileSize() calls are causing a lot of trouble.
Differential Revision: D19656425
Pulled By: al13n321
fbshipit-source-id: c2c421b367633033760d1f56747bad206d1fbf82
2020-02-04 09:24:29 +00:00
|
|
|
"skip_checking_sst_file_sizes_on_db_open=false;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"max_manifest_file_size=4295009941;"
|
|
|
|
"db_log_dir=path/to/db_log_dir;"
|
|
|
|
"writable_file_max_buffer_size=1048576;"
|
|
|
|
"paranoid_checks=true;"
|
2021-05-20 23:06:12 +00:00
|
|
|
"flush_verify_memtable_count=true;"
|
2023-07-28 16:47:31 +00:00
|
|
|
"compaction_verify_record_count=true;"
|
2020-10-09 23:40:25 +00:00
|
|
|
"track_and_verify_wals_in_manifest=true;"
|
2022-05-19 18:04:21 +00:00
|
|
|
"verify_sst_unique_id_in_manifest=true;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"is_fd_close_on_exec=false;"
|
|
|
|
"bytes_per_sync=4295013613;"
|
Optionally wait on bytes_per_sync to smooth I/O (#5183)
Summary:
The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways.
My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes.
Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it.
There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically).
The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183
Differential Revision: D14953553
Pulled By: siying
fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
2019-04-22 18:48:45 +00:00
|
|
|
"strict_bytes_per_sync=true;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"enable_thread_tracking=false;"
|
|
|
|
"recycle_log_file_num=0;"
|
|
|
|
"create_missing_column_families=true;"
|
|
|
|
"log_file_time_to_roll=3097;"
|
|
|
|
"max_background_flushes=35;"
|
|
|
|
"create_if_missing=false;"
|
|
|
|
"error_if_exists=true;"
|
|
|
|
"delayed_write_rate=4294976214;"
|
|
|
|
"manifest_preallocation_size=1222;"
|
|
|
|
"allow_mmap_writes=false;"
|
|
|
|
"stats_dump_period_sec=70127;"
|
2019-02-20 23:46:59 +00:00
|
|
|
"stats_persist_period_sec=54321;"
|
2019-06-17 22:17:43 +00:00
|
|
|
"persist_stats_to_disk=true;"
|
2019-02-20 23:46:59 +00:00
|
|
|
"stats_history_buffer_size=14159;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"allow_fallocate=true;"
|
|
|
|
"allow_mmap_reads=false;"
|
2016-10-28 17:36:05 +00:00
|
|
|
"use_direct_reads=false;"
|
2017-04-13 20:07:33 +00:00
|
|
|
"use_direct_io_for_flush_and_compaction=false;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"max_log_file_size=4607;"
|
|
|
|
"random_access_max_buffer_size=1048576;"
|
|
|
|
"advise_random_on_open=true;"
|
|
|
|
"fail_if_options_file_error=false;"
|
2017-05-19 21:24:23 +00:00
|
|
|
"enable_pipelined_write=false;"
|
2019-05-14 00:43:47 +00:00
|
|
|
"unordered_write=false;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"allow_concurrent_memtable_write=true;"
|
|
|
|
"wal_recovery_mode=kPointInTimeRecovery;"
|
|
|
|
"enable_write_thread_adaptive_yield=true;"
|
|
|
|
"write_thread_slow_yield_usec=5;"
|
|
|
|
"write_thread_max_yield_usec=1000;"
|
2016-04-27 23:23:33 +00:00
|
|
|
"info_log_level=DEBUG_LEVEL;"
|
2016-04-18 18:11:51 +00:00
|
|
|
"dump_malloc_stats=false;"
|
2016-06-13 18:34:16 +00:00
|
|
|
"allow_2pc=false;"
|
2016-11-02 22:22:13 +00:00
|
|
|
"avoid_flush_during_recovery=false;"
|
2017-05-17 18:32:26 +00:00
|
|
|
"avoid_flush_during_shutdown=false;"
|
2017-06-24 21:06:43 +00:00
|
|
|
"allow_ingest_behind=false;"
|
|
|
|
"concurrent_prepare=false;"
|
2017-11-11 01:18:01 +00:00
|
|
|
"two_write_queues=false;"
|
2017-09-18 21:36:53 +00:00
|
|
|
"manual_wal_flush=false;"
|
2022-01-26 21:57:30 +00:00
|
|
|
"wal_compression=kZSTD;"
|
Ensure Close() before LinkFile() for WALs in Checkpoint (#12734)
Summary:
POSIX semantics for LinkFile (hard links) allow linking a file
that is still being written two, with both the source and destination
showing any subsequent writes to the source. This may not be practical
semantics for some FileSystem implementations such as remote storage.
They might only link the flushed or sync-ed file contents at time of
LinkFile, or might even have undefined behavior if LinkFile is called on
a file still open for write (not yet "sealed"). This change builds on https://github.com/facebook/rocksdb/issues/12731
to bring more hygiene to our handling of WAL files in Checkpoint.
Specifically, we now Close WAL files as soon as they are either
(a) inactive and fully synced, or (b) inactive and obsolete (so maybe
never fully synced), rather than letting Close() happen in handling
obsolete files (maybe a background thread). This should not be a
performance issue as Close() should be trivial cost relative to other
IO ops, but just in case:
* We don't Close() while holding a mutex, to avoid blocking, and
* The old behavior is available with a new kill switch option
`background_close_inactive_wals`.
Stacked on https://github.com/facebook/rocksdb/issues/12731
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12734
Test Plan:
Extended existing unit test, especially adding a hygiene
check to FaultInjectionTestFS to detect LinkFile() on a file still open
for writes. FaultInjectionTestFS already has relevant tracking data, and
tests can opt out of the new check, as in a smoke test I have left for
the old, deprecated functionality `background_close_inactive_wals=true`.
Also ran lengthy blackbox_crash_test to ensure the hygiene check is OK
with the crash test. (The only place I can find we use LinkFile in
production is Checkpoint.)
Reviewed By: cbi42
Differential Revision: D58295284
Pulled By: pdillinger
fbshipit-source-id: 64d90ed8477e2366c19eaf9c4c5ad60b82cac5c6
2024-06-12 18:48:45 +00:00
|
|
|
"background_close_inactive_wals=true;"
|
2018-10-26 22:06:44 +00:00
|
|
|
"seq_per_batch=false;"
|
2019-04-02 00:07:38 +00:00
|
|
|
"atomic_flush=false;"
|
2019-07-19 18:54:38 +00:00
|
|
|
"avoid_unnecessary_blocking_io=false;"
|
2019-09-03 15:50:47 +00:00
|
|
|
"log_readahead_size=0;"
|
2020-03-21 02:17:54 +00:00
|
|
|
"write_dbid_to_manifest=false;"
|
2020-07-15 18:02:44 +00:00
|
|
|
"best_efforts_recovery=false;"
|
|
|
|
"max_bgerror_resume_count=2;"
|
2022-05-20 23:48:50 +00:00
|
|
|
"bgerror_resume_retry_interval=1000000;"
|
2020-11-13 06:08:03 +00:00
|
|
|
"db_host_id=hostname;"
|
2021-10-19 22:53:16 +00:00
|
|
|
"lowest_used_cache_tier=kNonVolatileBlockTier;"
|
2022-05-20 23:48:50 +00:00
|
|
|
"allow_data_in_errors=false;"
|
Offpeak in db option (#11893)
Summary:
RocksDB's primary function is to facilitate read and write operations. Compactions, while essential for minimizing read amplifications and optimizing storage, can sometimes compete with these primary tasks. Especially during periods of high read/write traffic, it's vital to ensure that primary operations receive priority, avoiding any potential disruptions or slowdowns. Conversely, during off-peak times when traffic is minimal, it's an opportune moment to tackle low-priority tasks like TTL based compactions, optimizing resource usage.
In this PR, we are incorporating the concept of off-peak time into RocksDB by introducing `daily_offpeak_time_utc` within the DBOptions. This setting is formatted as "HH:mm-HH:mm" where the first one before "-" is the start time and the second one is the end time, inclusive. It will be later used for resource optimization in subsequent PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11893
Test Plan:
- New Unit Test Added - `DBOptionsTest::OffPeakTimes`
- Existing Unit Test Updated - `OptionsTest`, `OptionsSettableTest`
Reviewed By: pdillinger
Differential Revision: D49714553
Pulled By: jaykorean
fbshipit-source-id: fef51ea7c0fede6431c715bff116ddbb567c8752
2023-09-29 20:03:39 +00:00
|
|
|
"enforce_single_del_contracts=false;"
|
2024-08-24 02:49:25 +00:00
|
|
|
"daily_offpeak_time_utc=08:30-19:00;"
|
|
|
|
"follower_refresh_catchup_period_ms=123;"
|
|
|
|
"follower_catchup_retry_count=456;"
|
|
|
|
"follower_catchup_retry_wait_ms=789;"
|
|
|
|
"metadata_write_temperature=kCold;"
|
2024-09-26 21:36:29 +00:00
|
|
|
"wal_write_temperature=kHot;"
|
|
|
|
"background_close_inactive_wals=true;"
|
|
|
|
"write_dbid_to_manifest=true;"
|
|
|
|
"write_identity_file=true;"
|
|
|
|
"prefix_seek_opt_in_only=true;",
|
2016-04-11 18:39:51 +00:00
|
|
|
new_options));
|
|
|
|
|
|
|
|
ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kDBOptionsExcluded));
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
options->~DBOptions();
|
|
|
|
new_options->~DBOptions();
|
|
|
|
|
|
|
|
delete[] options_ptr;
|
|
|
|
delete[] new_options_ptr;
|
|
|
|
}
|
|
|
|
|
2023-08-09 22:46:44 +00:00
|
|
|
// Builds with ROCKSDB_ASSERT_STATUS_CHECKED add the CXX flag
// -fno-elide-constructors, which makes this test fail.
|
|
|
|
#ifndef ROCKSDB_ASSERT_STATUS_CHECKED
|
2016-04-11 18:39:51 +00:00
|
|
|
// If the test fails, likely a new option is added to ColumnFamilyOptions
|
|
|
|
// but it cannot be set through GetColumnFamilyOptionsFromString(), or the
|
|
|
|
// test is not updated accordingly.
|
|
|
|
// After adding an option, we need to make sure it is settable by
|
|
|
|
// GetColumnFamilyOptionsFromString() and add the option to the input
|
2022-06-14 21:19:26 +00:00
|
|
|
// string passed to GetColumnFamilyOptionsFromString() in this test.
|
2016-04-11 18:39:51 +00:00
|
|
|
// If it is a complicated type, you also need to add the field to
|
2020-06-19 22:26:05 +00:00
|
|
|
// kColumnFamilyOptionsExcluded, and maybe add customized verification
|
2016-04-11 18:39:51 +00:00
|
|
|
// for it.
|
|
|
|
TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
|
2020-06-19 22:26:05 +00:00
|
|
|
// options in the excluded set need to appear in the same order as in
|
2016-11-14 02:58:17 +00:00
|
|
|
// ColumnFamilyOptions.
|
2020-06-19 22:26:05 +00:00
|
|
|
const OffsetGap kColumnFamilyOptionsExcluded = {
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, inplace_callback),
|
2017-02-28 01:36:06 +00:00
|
|
|
sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions,
|
|
|
|
memtable_insert_with_hint_prefix_extractor),
|
2016-04-11 18:39:51 +00:00
|
|
|
sizeof(std::shared_ptr<const SliceTransform>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, compression_per_level),
|
2017-02-28 01:36:06 +00:00
|
|
|
sizeof(std::vector<CompressionType>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions,
|
|
|
|
max_bytes_for_level_multiplier_additional),
|
2016-04-11 18:39:51 +00:00
|
|
|
sizeof(std::vector<int>)},
|
2023-05-11 23:40:59 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, compaction_options_fifo),
|
|
|
|
sizeof(struct CompactionOptionsFIFO)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, memtable_factory),
|
2016-04-11 18:39:51 +00:00
|
|
|
sizeof(std::shared_ptr<MemTableRepFactory>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions,
|
|
|
|
table_properties_collector_factories),
|
2016-04-11 18:39:51 +00:00
|
|
|
sizeof(ColumnFamilyOptions::TablePropertiesCollectorFactories)},
|
2022-07-15 04:49:34 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, preclude_last_level_data_seconds),
|
|
|
|
sizeof(uint64_t)},
|
2022-10-08 01:49:40 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, preserve_internal_time_seconds),
|
|
|
|
sizeof(uint64_t)},
|
2022-06-14 21:19:26 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, blob_cache),
|
|
|
|
sizeof(std::shared_ptr<Cache>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, comparator), sizeof(Comparator*)},
|
|
|
|
{offsetof(struct ColumnFamilyOptions, merge_operator),
|
2017-02-28 01:36:06 +00:00
|
|
|
sizeof(std::shared_ptr<MergeOperator>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, compaction_filter),
|
2017-02-28 01:36:06 +00:00
|
|
|
sizeof(const CompactionFilter*)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, compaction_filter_factory),
|
2017-02-28 01:36:06 +00:00
|
|
|
sizeof(std::shared_ptr<CompactionFilterFactory>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, prefix_extractor),
|
2016-11-14 02:58:17 +00:00
|
|
|
sizeof(std::shared_ptr<const SliceTransform>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, snap_refresh_nanos),
|
|
|
|
sizeof(uint64_t)},
|
|
|
|
{offsetof(struct ColumnFamilyOptions, table_factory),
|
2017-02-28 01:36:06 +00:00
|
|
|
sizeof(std::shared_ptr<TableFactory>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, cf_paths),
|
|
|
|
sizeof(std::vector<DbPath>)},
|
|
|
|
{offsetof(struct ColumnFamilyOptions, compaction_thread_limiter),
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
This PR aims to resolve the issue described at:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a RocksDB instance created with leveled compaction and multiple column families (CFs); some of the CFs use HDD to store large, less frequently accessed data, and others use SSD.
When there is continuous write traffic to all CFs, the compaction thread pool is mostly occupied by the slow HDD compactions, which prevents us from fully utilizing the SSD bandwidth.
Since atomic writes and transactions are needed across CFs, splitting the database into multiple RocksDB instances is not an option for us.
With compaction thread control, we got a 30%+ HDD write throughput gain, and also much smoother SSD writes since fewer write stalls happen.
ConcurrentTaskLimiter can be shared by multiple CFs across RocksDB instances, so the feature works not only for multi-CF scenarios but also for multi-RocksDB scenarios that need per-tenant disk I/O resource control.
The usage is straightforward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // fully throttled (0 tasks allowed)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
2018-12-13 21:16:04 +00:00
|
|
|
sizeof(std::shared_ptr<ConcurrentTaskLimiter>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516) However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct ColumnFamilyOptions, sst_partitioner_factory),
|
2020-07-24 20:43:14 +00:00
|
|
|
sizeof(std::shared_ptr<SstPartitionerFactory>)},
|
2016-04-11 18:39:51 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
char* options_ptr = new char[sizeof(ColumnFamilyOptions)];
|
|
|
|
|
|
|
|
// Count padding bytes by setting all bytes in the memory to a special char,
|
|
|
|
// copy a well constructed struct to this memory and see how many special
|
|
|
|
// bytes left.
|
|
|
|
FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kColumnFamilyOptionsExcluded);
|
2020-04-08 21:37:01 +00:00
|
|
|
|
Limit buffering for collecting samples for compression dictionary (#7970)
Summary:
For dictionary compression, we need to collect some representative samples of the data to be compressed, which we use to either generate or train (when `CompressionOptions::zstd_max_train_bytes > 0`) a dictionary. Previously, the strategy was to buffer all the data blocks during flush, and up to the target file size during compaction. That strategy allowed us to randomly pick samples from as wide a range as possible that'd be guaranteed to land in a single output file.
However, some users try to make huge files in memory-constrained environments, where this strategy can cause OOM. This PR introduces an option, `CompressionOptions::max_dict_buffer_bytes`, that limits how many bytes of data blocks are buffered before we switch to unbuffered mode (which means creating the per-SST dictionary, writing out the buffered data, and compressing/writing new blocks as soon as they are built). The limit is not strict, as we currently buffer more than just data blocks -- keys are buffered too. But it does take a step towards giving users predictable memory usage.
Related changes include:
- Changed sampling for dictionary compression to select unique data blocks when there is limited availability of data blocks
- Made use of `BlockBuilder::SwapAndReset()` to save an allocation+memcpy when buffering data blocks for building a dictionary
- Changed `ParseBoolean()` to accept an input containing characters after the boolean. This is necessary since, with this PR, a value for `CompressionOptions::enabled` is no longer necessarily the final component in the `CompressionOptions` string.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7970
Test Plan:
- updated `CompressionOptions` unit tests to verify limit is respected (to the extent expected in the current implementation) in various scenarios of flush/compaction to bottommost/non-bottommost level
- looked at jemalloc heap profiles right before and after switching to unbuffered mode during flush/compaction. Verified memory usage in buffering is proportional to the limit set.
Reviewed By: pdillinger
Differential Revision: D26467994
Pulled By: ajkr
fbshipit-source-id: 3da4ef9fba59974e4ef40e40c01611002c861465
2021-02-19 22:06:59 +00:00
|
|
|
// Invoke a user-defined constructor in the hope that it does not overwrite
|
|
|
|
// padding bytes. Note that previously we relied on the implicitly-defined
|
|
|
|
// copy-assignment operator (i.e., `*options = ColumnFamilyOptions();`) here,
|
|
|
|
// which did in fact modify padding bytes.
|
|
|
|
ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions();
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
int unset_bytes_base = NumUnsetBytes(options_ptr, sizeof(ColumnFamilyOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kColumnFamilyOptionsExcluded);
|
2016-04-11 18:39:51 +00:00
|
|
|
ASSERT_GT(unset_bytes_base, 0);
|
|
|
|
options->~ColumnFamilyOptions();
|
|
|
|
|
|
|
|
options = new (options_ptr) ColumnFamilyOptions();
|
|
|
|
FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kColumnFamilyOptionsExcluded);
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
// Following options are not settable through
|
|
|
|
// GetColumnFamilyOptionsFromString():
|
|
|
|
options->compaction_options_universal = CompactionOptionsUniversal();
|
2020-09-14 23:59:00 +00:00
|
|
|
options->num_levels = 42; // Initialize options for MutableCF
|
Concurrent task limiter for compaction thread control (#4332)
Summary:
This PR aims to resolve the issue described at:
https://github.com/facebook/rocksdb/issues/3972#issue-330771918
We have a RocksDB instance created with leveled compaction and multiple column families (CFs); some of the CFs use HDD to store large, less frequently accessed data, and others use SSD.
When there is continuous write traffic to all CFs, the compaction thread pool is mostly occupied by the slow HDD compactions, which prevents us from fully utilizing the SSD bandwidth.
Since atomic writes and transactions are needed across CFs, splitting the database into multiple RocksDB instances is not an option for us.
With compaction thread control, we got a 30%+ HDD write throughput gain, and also much smoother SSD writes since fewer write stalls happen.
ConcurrentTaskLimiter can be shared by multiple CFs across RocksDB instances, so the feature works not only for multi-CF scenarios but also for multi-RocksDB scenarios that need per-tenant disk I/O resource control.
The usage is straightforward:
e.g.:
//
// Enable compaction thread limiter thru ColumnFamilyOptions
//
std::shared_ptr<ConcurrentTaskLimiter> ctl(NewConcurrentTaskLimiter("foo_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = ctl;
...
//
// Compaction thread limiter can be tuned or disabled on-the-fly
//
ctl->SetMaxOutstandingTask(12); // enlarge to 12 tasks
...
ctl->ResetMaxOutstandingTask(); // disable (bypass) thread limiter
ctl->SetMaxOutstandingTask(-1); // Same as above
...
ctl->SetMaxOutstandingTask(0); // fully throttled (0 tasks allowed)
//
// Sharing compaction thread limiter among CFs (to resolve multiple storage perf issue)
//
std::shared_ptr<ConcurrentTaskLimiter> ctl_ssd(NewConcurrentTaskLimiter("ssd_limiter", 8));
std::shared_ptr<ConcurrentTaskLimiter> ctl_hdd(NewConcurrentTaskLimiter("hdd_limiter", 4));
Options options;
ColumnFamilyOptions cf_opt_ssd1(options);
ColumnFamilyOptions cf_opt_ssd2(options);
ColumnFamilyOptions cf_opt_hdd1(options);
ColumnFamilyOptions cf_opt_hdd2(options);
ColumnFamilyOptions cf_opt_hdd3(options);
// SSD CFs
cf_opt_ssd1.compaction_thread_limiter = ctl_ssd;
cf_opt_ssd2.compaction_thread_limiter = ctl_ssd;
// HDD CFs
cf_opt_hdd1.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd2.compaction_thread_limiter = ctl_hdd;
cf_opt_hdd3.compaction_thread_limiter = ctl_hdd;
...
//
// The limiter is disabled by default (or set to nullptr explicitly)
//
Options options;
ColumnFamilyOptions cf_opt(options);
cf_opt.compaction_thread_limiter = nullptr;
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4332
Differential Revision: D13226590
Pulled By: siying
fbshipit-source-id: 14307aec55b8bd59c8223d04aa6db3c03d1b0c1d
2018-12-13 21:16:04 +00:00
|
|
|
options->compaction_filter = nullptr;
|
2020-07-24 20:43:14 +00:00
|
|
|
options->sst_partitioner_factory = nullptr;
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)];
|
|
|
|
ColumnFamilyOptions* new_options =
|
|
|
|
new (new_options_ptr) ColumnFamilyOptions();
|
|
|
|
FillWithSpecialChar(new_options_ptr, sizeof(ColumnFamilyOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kColumnFamilyOptionsExcluded);
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
// Need to update the option string if a new option is added.
|
2023-02-07 22:11:53 +00:00
|
|
|
ConfigOptions config_options;
|
|
|
|
config_options.input_strings_escaped = false;
|
|
|
|
config_options.ignore_unknown_options = false;
|
2016-04-11 18:39:51 +00:00
|
|
|
ASSERT_OK(GetColumnFamilyOptionsFromString(
|
2023-02-07 22:11:53 +00:00
|
|
|
config_options, *options,
|
2016-04-11 18:39:51 +00:00
|
|
|
"compaction_filter_factory=mpudlojcujCompactionFilterFactory;"
|
|
|
|
"table_factory=PlainTable;"
|
|
|
|
"prefix_extractor=rocksdb.CappedPrefix.13;"
|
|
|
|
"comparator=leveldb.BytewiseComparator;"
|
|
|
|
"compression_per_level=kBZip2Compression:kBZip2Compression:"
|
|
|
|
"kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:"
|
|
|
|
"kSnappyCompression;"
|
|
|
|
"max_bytes_for_level_base=986;"
|
|
|
|
"bloom_locality=8016;"
|
|
|
|
"target_file_size_base=4294976376;"
|
2016-07-27 01:05:30 +00:00
|
|
|
"memtable_huge_page_size=2557;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"max_successive_merges=5497;"
|
2024-02-21 21:15:27 +00:00
|
|
|
"strict_max_successive_merges=true;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"max_sequential_skip_in_iterations=4294971408;"
|
|
|
|
"arena_block_size=1893;"
|
|
|
|
"target_file_size_multiplier=35;"
|
|
|
|
"min_write_buffer_number_to_merge=9;"
|
|
|
|
"max_write_buffer_number=84;"
|
|
|
|
"write_buffer_size=1653;"
|
2016-06-16 23:02:52 +00:00
|
|
|
"max_compaction_bytes=64;"
|
Ignore max_compaction_bytes for compaction input that are within output key-range (#10835)
Summary:
When picking compaction input files, we sometimes stop picking a file that is fully included in the output key-range due to hitting max_compaction_bytes. Including these input files can potentially reduce WA at the expense of larger compactions. Larger compaction should be fine as files from input level are usually 10X smaller than files from output level. This PR adds a mutable CF option `ignore_max_compaction_bytes_for_input` that is enabled by default. We can remove this option once we are sure it is safe.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10835
Test Plan:
- CI, a unit test on max_compaction_bytes fails before turning this flag off.
- Benchmark does not show much difference in WA: `./db_bench --benchmarks=fillrandom,waitforcompaction,stats,levelstats -max_background_jobs=12 -num=2000000000 -target_file_size_base=33554432 --write_buffer_size=33554432`
```
main:
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 3/0 91.59 MB 0.8 70.9 0.0 70.9 200.8 129.9 0.0 1.5 25.2 71.2 2886.55 2463.45 9725 0.297 1093M 254K 0.0 0.0
L1 9/0 248.03 MB 1.0 392.0 129.8 262.2 391.7 129.5 0.0 3.0 69.0 68.9 5821.71 5536.90 804 7.241 6029M 5814K 0.0 0.0
L2 87/0 2.50 GB 1.0 537.0 128.5 408.5 533.8 125.2 0.7 4.2 69.5 69.1 7912.24 7323.70 4417 1.791 8299M 36M 0.0 0.0
L3 836/0 24.99 GB 1.0 616.9 118.3 498.7 594.5 95.8 5.2 5.0 66.9 64.5 9442.38 8490.28 4204 2.246 9749M 306M 0.0 0.0
L4 2355/0 62.95 GB 0.3 67.3 37.1 30.2 54.2 24.0 38.9 1.5 72.2 58.2 954.37 821.18 917 1.041 1076M 173M 0.0 0.0
Sum 3290/0 90.77 GB 0.0 1684.2 413.7 1270.5 1775.0 504.5 44.9 13.7 63.8 67.3 27017.25 24635.52 20067 1.346 26G 522M 0.0 0.0
Cumulative compaction: 1774.96 GB write, 154.29 MB/s write, 1684.19 GB read, 146.40 MB/s read, 27017.3 seconds
This PR:
** Compaction Stats [default] **
Level Files Size Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 3/0 45.71 MB 0.8 72.9 0.0 72.9 202.8 129.9 0.0 1.6 25.4 70.7 2938.16 2510.36 9741 0.302 1124M 265K 0.0 0.0
L1 8/0 234.54 MB 0.9 384.5 129.8 254.7 384.2 129.6 0.0 3.0 69.0 68.9 5708.08 5424.43 791 7.216 5913M 5753K 0.0 0.0
L2 84/0 2.47 GB 1.0 543.1 128.6 414.5 539.9 125.4 0.7 4.2 69.6 69.2 7989.31 7403.13 4418 1.808 8393M 36M 0.0 0.0
L3 839/0 24.96 GB 1.0 615.6 118.4 497.2 593.2 96.0 5.1 5.0 66.6 64.1 9471.23 8489.31 4193 2.259 9726M 306M 0.0 0.0
L4 2360/0 63.04 GB 0.3 67.6 37.3 30.3 54.4 24.1 38.9 1.5 71.5 57.6 967.30 827.99 907 1.066 1080M 173M 0.0 0.0
Sum 3294/0 90.75 GB 0.0 1683.8 414.2 1269.6 1774.5 504.9 44.8 13.7 63.7 67.1 27074.08 24655.22 20050 1.350 26G 522M 0.0 0.0
Cumulative compaction: 1774.52 GB write, 157.09 MB/s write, 1683.77 GB read, 149.06 MB/s read, 27074.1 seconds
```
Reviewed By: ajkr
Differential Revision: D40518319
Pulled By: cbi42
fbshipit-source-id: f4ea614bc0ebefe007ffaf05bb9aec9a8ca25b60
2022-10-21 17:22:41 +00:00
|
|
|
"ignore_max_compaction_bytes_for_input=true;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"max_bytes_for_level_multiplier=60;"
|
|
|
|
"memtable_factory=SkipListFactory;"
|
|
|
|
"compression=kNoCompression;"
|
2023-04-22 04:57:40 +00:00
|
|
|
"compression_opts={max_dict_buffer_bytes=5;use_zstd_dict_trainer=true;"
|
|
|
|
"enabled=false;parallel_threads=6;zstd_max_train_bytes=7;strategy=8;max_"
|
Add `CompressionOptions::checksum` for enabling ZSTD checksum (#11666)
Summary:
Optionally enable zstd checksum flag (https://github.com/facebook/zstd/blob/d857369028d997c92ff1f1861a4d7f679a125464/lib/zstd.h#L428) to detect corruption during decompression. Main changes are in compression.h:
* User can set CompressionOptions::checksum to true to enable this feature.
* We enable this feature in ZSTD by setting the checksum flag in ZSTD compression context: `ZSTD_CCtx`.
* Uses `ZSTD_compress2()` to do compression since it supports frame parameter like the checksum flag. Compression level is also set in compression context as a flag.
* Error handling during decompression to propagate error message from ZSTD.
* Updated microbench to test read performance impact.
About compatibility, the current compression decoders should continue to work with the data created by the new compression API `ZSTD_compress2()`: https://github.com/facebook/zstd/issues/3711.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11666
Test Plan:
* Existing unit tests for zstd compression
* Add unit test `DBTest2.ZSTDChecksum` to test the corruption case
* Manually tested that compression levels, parallel compression, dictionary compression, index compression all work with the new ZSTD_compress2() API.
* Manually tested with `sst_dump --command=recompress` that different compression levels and dictionary compression settings all work.
* Manually tested compiling with older versions of ZSTD: v1.3.8, v1.1.0, v0.6.2.
* Perf impact: from public benchmark data: http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html for checksum and https://github.com/facebook/zstd#benchmarks, if decompression is 1700MB/s and checksum computation is 70000MB/s, checksum computation is an additional ~2.4% time for decompression. Compression is slower and checksumming should be less noticeable.
* Microbench:
```
TEST_TMPDIR=/dev/shm ./branch_db_basic_bench --benchmark_filter=DBGet/comp_style:0/max_data:1048576/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/mmap:0/compression_type:7/compression_checksum:1/no_blockcache:1/iterations:10000/threads:1 --benchmark_repetitions=100
Min out of 100 runs:
Main:
10390 10436 10456 10484 10499 10535 10544 10545 10565 10568
After this PR, checksum=false
10285 10397 10503 10508 10515 10557 10562 10635 10640 10660
After this PR, checksum=true
10827 10876 10925 10949 10971 11052 11061 11063 11100 11109
```
* db_bench:
```
Write perf
TEST_TMPDIR=/dev/shm/ ./db_bench_ichecksum --benchmarks=fillseq[-X10] --compression_type=zstd --num=10000000 --compression_checksum=..
[FillSeq checksum=0]
fillseq [AVG 10 runs] : 281635 (± 31711) ops/sec; 31.2 (± 3.5) MB/sec
fillseq [MEDIAN 10 runs] : 294027 ops/sec; 32.5 MB/sec
[FillSeq checksum=1]
fillseq [AVG 10 runs] : 286961 (± 34700) ops/sec; 31.7 (± 3.8) MB/sec
fillseq [MEDIAN 10 runs] : 283278 ops/sec; 31.3 MB/sec
Read perf
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=readrandom[-X20] --num=100000000 --reads=1000000 --use_existing_db=true --readonly=1
[Readrandom checksum=1]
readrandom [AVG 20 runs] : 360928 (± 3579) ops/sec; 4.0 (± 0.0) MB/sec
readrandom [MEDIAN 20 runs] : 362468 ops/sec; 4.0 MB/sec
[Readrandom checksum=0]
readrandom [AVG 20 runs] : 380365 (± 2384) ops/sec; 4.2 (± 0.0) MB/sec
readrandom [MEDIAN 20 runs] : 379800 ops/sec; 4.2 MB/sec
Compression
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=compress[-X20] --compression_type=zstd --num=100000000 --compression_checksum=1
checksum=1
compress [AVG 20 runs] : 54074 (± 634) ops/sec; 211.2 (± 2.5) MB/sec
compress [MEDIAN 20 runs] : 54396 ops/sec; 212.5 MB/sec
checksum=0
compress [AVG 20 runs] : 54598 (± 393) ops/sec; 213.3 (± 1.5) MB/sec
compress [MEDIAN 20 runs] : 54592 ops/sec; 213.3 MB/sec
Decompression:
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=uncompress[-X20] --compression_type=zstd --compression_checksum=1
checksum = 0
uncompress [AVG 20 runs] : 167499 (± 962) ops/sec; 654.3 (± 3.8) MB/sec
uncompress [MEDIAN 20 runs] : 167210 ops/sec; 653.2 MB/sec
checksum = 1
uncompress [AVG 20 runs] : 167980 (± 924) ops/sec; 656.2 (± 3.6) MB/sec
uncompress [MEDIAN 20 runs] : 168465 ops/sec; 658.1 MB/sec
```
Reviewed By: ajkr
Differential Revision: D48019378
Pulled By: cbi42
fbshipit-source-id: 674120c6e1853c2ced1436ac8138559d0204feba
2023-08-18 22:01:59 +00:00
|
|
|
"dict_bytes=9;level=10;window_bits=11;max_compressed_bytes_per_kb=987;"
|
|
|
|
"checksum=true};"
|
2023-04-22 04:57:40 +00:00
|
|
|
"bottommost_compression_opts={max_dict_buffer_bytes=4;use_zstd_dict_"
|
|
|
|
"trainer=true;enabled=true;parallel_threads=5;zstd_max_train_bytes=6;"
|
|
|
|
"strategy=7;max_dict_bytes=8;level=9;window_bits=10;max_compressed_bytes_"
|
Add `CompressionOptions::checksum` for enabling ZSTD checksum (#11666)
Summary:
Optionally enable zstd checksum flag (https://github.com/facebook/zstd/blob/d857369028d997c92ff1f1861a4d7f679a125464/lib/zstd.h#L428) to detect corruption during decompression. Main changes are in compression.h:
* User can set CompressionOptions::checksum to true to enable this feature.
* We enable this feature in ZSTD by setting the checksum flag in ZSTD compression context: `ZSTD_CCtx`.
* Uses `ZSTD_compress2()` to do compression since it supports frame parameter like the checksum flag. Compression level is also set in compression context as a flag.
* Error handling during decompression to propagate error message from ZSTD.
* Updated microbench to test read performance impact.
About compatibility, the current compression decoders should continue to work with the data created by the new compression API `ZSTD_compress2()`: https://github.com/facebook/zstd/issues/3711.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11666
Test Plan:
* Existing unit tests for zstd compression
* Add unit test `DBTest2.ZSTDChecksum` to test the corruption case
* Manually tested that compression levels, parallel compression, dictionary compression, index compression all work with the new ZSTD_compress2() API.
* Manually tested with `sst_dump --command=recompress` that different compression levels and dictionary compression settings all work.
* Manually tested compiling with older versions of ZSTD: v1.3.8, v1.1.0, v0.6.2.
* Perf impact: from public benchmark data: http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html for checksum and https://github.com/facebook/zstd#benchmarks, if decompression is 1700MB/s and checksum computation is 70000MB/s, checksum computation is an additional ~2.4% time for decompression. Compression is slower and checksumming should be less noticeable.
* Microbench:
```
TEST_TMPDIR=/dev/shm ./branch_db_basic_bench --benchmark_filter=DBGet/comp_style:0/max_data:1048576/per_key_size:256/enable_statistics:0/negative_query:0/enable_filter:0/mmap:0/compression_type:7/compression_checksum:1/no_blockcache:1/iterations:10000/threads:1 --benchmark_repetitions=100
Min out of 100 runs:
Main:
10390 10436 10456 10484 10499 10535 10544 10545 10565 10568
After this PR, checksum=false
10285 10397 10503 10508 10515 10557 10562 10635 10640 10660
After this PR, checksum=true
10827 10876 10925 10949 10971 11052 11061 11063 11100 11109
```
* db_bench:
```
Write perf
TEST_TMPDIR=/dev/shm/ ./db_bench_ichecksum --benchmarks=fillseq[-X10] --compression_type=zstd --num=10000000 --compression_checksum=..
[FillSeq checksum=0]
fillseq [AVG 10 runs] : 281635 (± 31711) ops/sec; 31.2 (± 3.5) MB/sec
fillseq [MEDIAN 10 runs] : 294027 ops/sec; 32.5 MB/sec
[FillSeq checksum=1]
fillseq [AVG 10 runs] : 286961 (± 34700) ops/sec; 31.7 (± 3.8) MB/sec
fillseq [MEDIAN 10 runs] : 283278 ops/sec; 31.3 MB/sec
Read perf
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=readrandom[-X20] --num=100000000 --reads=1000000 --use_existing_db=true --readonly=1
[Readrandom checksum=1]
readrandom [AVG 20 runs] : 360928 (± 3579) ops/sec; 4.0 (± 0.0) MB/sec
readrandom [MEDIAN 20 runs] : 362468 ops/sec; 4.0 MB/sec
[Readrandom checksum=0]
readrandom [AVG 20 runs] : 380365 (± 2384) ops/sec; 4.2 (± 0.0) MB/sec
readrandom [MEDIAN 20 runs] : 379800 ops/sec; 4.2 MB/sec
Compression
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=compress[-X20] --compression_type=zstd --num=100000000 --compression_checksum=1
checksum=1
compress [AVG 20 runs] : 54074 (± 634) ops/sec; 211.2 (± 2.5) MB/sec
compress [MEDIAN 20 runs] : 54396 ops/sec; 212.5 MB/sec
checksum=0
compress [AVG 20 runs] : 54598 (± 393) ops/sec; 213.3 (± 1.5) MB/sec
compress [MEDIAN 20 runs] : 54592 ops/sec; 213.3 MB/sec
Decompression:
TEST_TMPDIR=/dev/shm ./db_bench_ichecksum --benchmarks=uncompress[-X20] --compression_type=zstd --compression_checksum=1
checksum = 0
uncompress [AVG 20 runs] : 167499 (± 962) ops/sec; 654.3 (± 3.8) MB/sec
uncompress [MEDIAN 20 runs] : 167210 ops/sec; 653.2 MB/sec
checksum = 1
uncompress [AVG 20 runs] : 167980 (± 924) ops/sec; 656.2 (± 3.6) MB/sec
uncompress [MEDIAN 20 runs] : 168465 ops/sec; 658.1 MB/sec
```
Reviewed By: ajkr
Differential Revision: D48019378
Pulled By: cbi42
fbshipit-source-id: 674120c6e1853c2ced1436ac8138559d0204feba
2023-08-18 22:01:59 +00:00
|
|
|
"per_kb=876;checksum=true};"
|
2016-05-09 22:57:19 +00:00
|
|
|
"bottommost_compression=kDisableCompressionOption;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"level0_stop_writes_trigger=33;"
|
|
|
|
"num_levels=99;"
|
|
|
|
"level0_slowdown_writes_trigger=22;"
|
|
|
|
"level0_file_num_compaction_trigger=14;"
|
|
|
|
"compaction_filter=urxcqstuwnCompactionFilter;"
|
|
|
|
"soft_pending_compaction_bytes_limit=0;"
|
|
|
|
"max_write_buffer_number_to_maintain=84;"
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The old option actually works as both an upper bound and a lower bound. History trimming will start if the number of immutable memtables exceeds the limit, but the number will never go below (limit-1) due to history trimming.
In order to mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage to go below the size limit. For example, assume the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
"max_write_buffer_size_to_maintain=2147483648;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"merge_operator=aabcxehazrMergeOperator;"
|
2016-06-04 00:02:10 +00:00
|
|
|
"memtable_prefix_bloom_size_ratio=0.4642;"
|
2019-02-19 20:12:25 +00:00
|
|
|
"memtable_whole_key_filtering=true;"
|
2016-11-14 02:58:17 +00:00
|
|
|
"memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;"
|
2020-10-01 17:08:52 +00:00
|
|
|
"check_flush_compaction_key_order=false;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"paranoid_file_checks=true;"
|
2016-10-08 00:21:45 +00:00
|
|
|
"force_consistency_checks=true;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"inplace_update_num_locks=7429;"
|
2022-06-23 16:42:18 +00:00
|
|
|
"experimental_mempurge_threshold=0.0001;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"optimize_filters_for_hits=false;"
|
|
|
|
"level_compaction_dynamic_level_bytes=false;"
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there is "wasted" data at the beginning and end of the
output level files. Aligning the file boundaries can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut the file before a skippable grandparent file. This is for
use cases like a user adding 2 or more non-overlapping data ranges at the same
time; it can reduce the overlap of the 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2022-09-30 02:43:55 +00:00
|
|
|
"level_compaction_dynamic_file_size=true;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"inplace_update_support=false;"
|
|
|
|
"compaction_style=kCompactionStyleFIFO;"
|
2017-03-02 18:08:49 +00:00
|
|
|
"compaction_pri=kMinOverlappingRatio;"
|
2016-04-11 18:39:51 +00:00
|
|
|
"hard_pending_compaction_bytes_limit=0;"
|
|
|
|
"disable_auto_compactions=false;"
|
2017-10-19 22:19:20 +00:00
|
|
|
"report_bg_io_stats=true;"
|
2018-04-03 04:57:28 +00:00
|
|
|
"ttl=60;"
|
Periodic Compactions (#5166)
Summary:
Introducing Periodic Compactions.
This feature allows all the files in a CF to be periodically compacted. It could help in catching any corruptions that could creep into the DB proactively as every file is constantly getting re-compacted. And also, of course, it helps to cleanup data older than certain threshold.
- Introduced a new option `periodic_compaction_time` to control how long a file can live without being compacted in a CF.
- This works across all levels.
- The files are put in the same level after going through the compaction. (Related files in the same level are picked up as `ExpandInputstoCleanCut` is used).
- Compaction filters, if any, are invoked as usual.
- A new table property, `file_creation_time`, is introduced to implement this feature. This property is set to the time at which the SST file was created (and that time is given by the underlying Env/OS).
This feature can be enabled on its own, or in conjunction with `ttl`. It is possible to set a different time threshold for the bottom level when used in conjunction with ttl. Since `ttl` works only on 0 to last but one levels, you could set `ttl` to, say, 1 day, and `periodic_compaction_time` to, say, 7 days. Since `ttl < periodic_compaction_time` all files in last but one levels keep getting picked up based on ttl, and almost never based on periodic_compaction_time. The files in the bottom level get picked up for compaction based on `periodic_compaction_time`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5166
Differential Revision: D14884441
Pulled By: sagar0
fbshipit-source-id: 408426cbacb409c06386a98632dcf90bfa1bda47
2019-04-11 02:24:25 +00:00
|
|
|
"periodic_compaction_seconds=3600;"
|
2019-03-18 19:07:35 +00:00
|
|
|
"sample_for_compression=0;"
|
2020-08-19 01:31:31 +00:00
|
|
|
"enable_blob_files=true;"
|
|
|
|
"min_blob_size=256;"
|
|
|
|
"blob_file_size=1000000;"
|
|
|
|
"blob_compression_type=kBZip2Compression;"
|
2020-11-13 02:57:20 +00:00
|
|
|
"enable_blob_garbage_collection=true;"
|
|
|
|
"blob_garbage_collection_age_cutoff=0.5;"
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
"blob_garbage_collection_force_threshold=0.75;"
|
2021-11-20 01:52:42 +00:00
|
|
|
"blob_compaction_readahead_size=262144;"
|
2022-06-03 03:04:33 +00:00
|
|
|
"blob_file_starting_level=1;"
|
2022-07-17 14:13:59 +00:00
|
|
|
"prepopulate_blob_cache=kDisable;"
|
2022-01-25 22:58:48 +00:00
|
|
|
"bottommost_temperature=kWarm;"
|
2022-08-08 21:36:34 +00:00
|
|
|
"last_level_temperature=kWarm;"
|
2024-02-28 22:36:13 +00:00
|
|
|
"default_write_temperature=kCold;"
|
2023-08-18 00:06:57 +00:00
|
|
|
"default_temperature=kHot;"
|
2022-07-15 04:49:34 +00:00
|
|
|
"preclude_last_level_data_seconds=86400;"
|
2022-10-08 01:49:40 +00:00
|
|
|
"preserve_internal_time_seconds=86400;"
|
2019-02-15 17:48:44 +00:00
|
|
|
"compaction_options_fifo={max_table_files_size=3;allow_"
|
2023-05-11 23:40:59 +00:00
|
|
|
"compaction=true;age_for_warm=0;file_temperature_age_thresholds={{"
|
|
|
|
"temperature=kCold;age=12345}};};"
|
2022-08-12 20:51:32 +00:00
|
|
|
"blob_cache=1M;"
|
2023-04-12 00:50:34 +00:00
|
|
|
"memtable_protection_bytes_per_key=2;"
|
2023-04-25 19:08:23 +00:00
|
|
|
"persist_user_defined_timestamps=true;"
|
2023-08-03 02:58:56 +00:00
|
|
|
"block_protection_bytes_per_key=1;"
|
Delay bottommost level single file compactions (#11701)
Summary:
For leveled compaction, RocksDB has a special kind of compaction with reason "kBottommostFiles" that compacts bottommost level files to clear data held by snapshots (more detail in https://github.com/facebook/rocksdb/issues/3009). Such compactions can happen soon after a relevant snapshot is released. For some use cases, a bottommost file may contain only a small number of keys that can be cleared, so compacting such a file has a high write amp. In addition, these bottommost files may be compacted in compactions with reason other than "kBottommostFiles" if we wait for some time (so that enough data is ingested to trigger such a compaction). This PR introduces an option `bottommost_file_compaction_delay` to specify the delay of these bottommost level single file compactions.
* The main change is in `VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction()` where we only add a file to `bottommost_files_marked_for_compaction_` if oldest_snapshot is larger than its non-zero largest_seqno **and** the file is old enough. Note that if a file is not old enough but its largest_seqno is less than oldest_snapshot, we exclude it from the calculation of `bottommost_files_mark_threshold_`. This makes the change simpler, but such a file's eligibility for compaction will only be checked the next time `ComputeBottommostFilesMarkedForCompaction()` is called. This happens when a new Version is created (compaction, flush, SetOptions()...), a new enough snapshot is released (`VersionStorageInfo::UpdateOldestSnapshot()`) or when a compaction is picked and compaction score has to be re-calculated.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11701
Test Plan:
* Add two unit tests to test when bottommost_file_compaction_delay > 0.
* Ran crash test with the new option.
Reviewed By: jaykorean, ajkr
Differential Revision: D48331564
Pulled By: cbi42
fbshipit-source-id: c584f3dc5f6354fce3ed65f4c6366dc450b15ba8
2023-08-17 00:45:44 +00:00
|
|
|
"memtable_max_range_deletions=999999;"
|
Support pro-actively erasing obsolete block cache entries (#12694)
Summary:
Currently, when files become obsolete, the block cache entries associated with them just age out naturally. With pure LRU, this is not too bad, as once you "use" enough cache entries to (re-)fill the cache, you are guaranteed to have purged the obsolete entries. However, HyperClockCache is a counting clock cache with a somewhat longer memory, so could be more negatively impacted by previously-hot cache entries becoming obsolete, and taking longer to age out than newer single-hit entries.
Part of the reason we still have this natural aging-out is that there's almost no connection between block cache entries and the file they are associated with. Everything is hashed into the same pool(s) of entries with nothing like a secondary index based on file. Keeping track of such an index could be expensive.
This change adds a new, mutable CF option `uncache_aggressiveness` for erasing obsolete block cache entries. The process can be speculative, lossy, or unproductive because not all potential block cache entries associated with files will be resident in memory, and attempting to remove them all could be wasted CPU time. Rather than a simple on/off switch, `uncache_aggressiveness` basically tells RocksDB how much CPU you're willing to burn trying to purge obsolete block cache entries. When such efforts are not sufficiently productive for a file, we stop and move on.
The option is in ColumnFamilyOptions so that it is dynamically changeable for already-open files, and customizable by CF.
Note that this block cache removal happens as part of the process of purging obsolete files, which is often in a background thread (depending on `background_purge_on_iterator_cleanup` and `avoid_unnecessary_blocking_io` options) rather than along CPU critical paths.
Notable auxiliary code details:
* Possibly fixing some issues with trivial moves with `only_delete_metadata`: unnecessary TableCache::Evict in that case and missing from the ObsoleteFileInfo move operator. (Not able to reproduce a current failure.)
* Remove suspicious TableCache::Erase() from VersionSet::AddObsoleteBlobFile() (TODO follow-up item)
Marked EXPERIMENTAL until more thorough validation is complete.
Direct stats of this functionality are omitted because they could be misleading. Block cache hit rate is a better indicator of benefit, and CPU profiling a better indicator of cost.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12694
Test Plan:
* Unit tests added, including refactoring an existing test to make better use of parameterized tests.
* Added to crash test.
* Performance, sample command:
```
for I in `seq 1 10`; do for UA in 300; do for CT in lru_cache fixed_hyper_clock_cache auto_hyper_clock_cache; do rm -rf /dev/shm/test3; TEST_TMPDIR=/dev/shm/test3 /usr/bin/time ./db_bench -benchmarks=readwhilewriting -num=13000000 -read_random_exp_range=6 -write_buffer_size=10000000 -bloom_bits=10 -cache_type=$CT -cache_size=390000000 -cache_index_and_filter_blocks=1 -disable_wal=1 -duration=60 -statistics -uncache_aggressiveness=$UA 2>&1 | grep -E 'micros/op|rocksdb.block.cache.data.(hit|miss)|rocksdb.number.keys.(read|written)|maxresident' | awk '/rocksdb.block.cache.data.miss/ { miss = $4 } /rocksdb.block.cache.data.hit/ { hit = $4 } { print } END { print "hit rate = " ((hit * 1.0) / (miss + hit)) }' | tee -a results-$CT-$UA; done; done; done
```
Averaging 10 runs each case, block cache data block hit rates
```
lru_cache
UA=0 -> hit rate = 0.327, ops/s = 87668, user CPU sec = 139.0
UA=300 -> hit rate = 0.336, ops/s = 87960, user CPU sec = 139.0
fixed_hyper_clock_cache
UA=0 -> hit rate = 0.336, ops/s = 100069, user CPU sec = 139.9
UA=300 -> hit rate = 0.343, ops/s = 100104, user CPU sec = 140.2
auto_hyper_clock_cache
UA=0 -> hit rate = 0.336, ops/s = 97580, user CPU sec = 140.5
UA=300 -> hit rate = 0.345, ops/s = 97972, user CPU sec = 139.8
```
Conclusion: up to roughly 1 percentage point of improved block cache hit rate, likely leading to overall improved efficiency (because the foreground CPU cost of cache misses likely outweighs the background CPU cost of erasure, let alone I/O savings).
Reviewed By: ajkr
Differential Revision: D57932442
Pulled By: pdillinger
fbshipit-source-id: 84a243ca5f965f731f346a4853009780a904af6c
2024-06-07 15:57:11 +00:00
|
|
|
"bottommost_file_compaction_delay=7200;"
|
2024-08-19 20:53:25 +00:00
|
|
|
"uncache_aggressiveness=1234;"
|
|
|
|
"paranoid_memory_checks=1;",
|
2016-04-11 18:39:51 +00:00
|
|
|
new_options));
|
|
|
|
|
2022-06-14 21:19:26 +00:00
|
|
|
ASSERT_NE(new_options->blob_cache.get(), nullptr);
|
|
|
|
|
2016-04-11 18:39:51 +00:00
|
|
|
ASSERT_EQ(unset_bytes_base,
|
|
|
|
NumUnsetBytes(new_options_ptr, sizeof(ColumnFamilyOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kColumnFamilyOptionsExcluded));
|
2016-04-11 18:39:51 +00:00
|
|
|
|
2023-05-11 23:40:59 +00:00
|
|
|
// Custom verification since compaction_options_fifo was in
|
|
|
|
// kColumnFamilyOptionsExcluded
|
|
|
|
ASSERT_EQ(new_options->compaction_options_fifo.max_table_files_size, 3);
|
|
|
|
ASSERT_EQ(new_options->compaction_options_fifo.allow_compaction, true);
|
|
|
|
ASSERT_EQ(new_options->compaction_options_fifo.file_temperature_age_thresholds
|
|
|
|
.size(),
|
|
|
|
1);
|
|
|
|
ASSERT_EQ(
|
|
|
|
new_options->compaction_options_fifo.file_temperature_age_thresholds[0]
|
|
|
|
.temperature,
|
|
|
|
Temperature::kCold);
|
|
|
|
ASSERT_EQ(
|
|
|
|
new_options->compaction_options_fifo.file_temperature_age_thresholds[0]
|
|
|
|
.age,
|
|
|
|
12345);
|
|
|
|
|
2020-04-08 21:37:01 +00:00
|
|
|
ColumnFamilyOptions rnd_filled_options = *new_options;
|
|
|
|
|
2016-04-11 18:39:51 +00:00
|
|
|
options->~ColumnFamilyOptions();
|
|
|
|
new_options->~ColumnFamilyOptions();
|
|
|
|
|
|
|
|
delete[] options_ptr;
|
|
|
|
delete[] new_options_ptr;
|
2020-04-08 21:37:01 +00:00
|
|
|
|
|
|
|
// Test copying to mutable and immutable options and copy back the mutable
|
|
|
|
// part.
|
2020-06-19 22:26:05 +00:00
|
|
|
const OffsetGap kMutableCFOptionsExcluded = {
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516). However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct MutableCFOptions, prefix_extractor),
|
2020-04-08 21:37:01 +00:00
|
|
|
sizeof(std::shared_ptr<const SliceTransform>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516). However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct MutableCFOptions,
|
|
|
|
max_bytes_for_level_multiplier_additional),
|
2020-04-08 21:37:01 +00:00
|
|
|
sizeof(std::vector<int>)},
|
2023-05-11 23:40:59 +00:00
|
|
|
{offsetof(struct MutableCFOptions, compaction_options_fifo),
|
|
|
|
sizeof(struct CompactionOptionsFIFO)},
|
2022-03-08 02:06:19 +00:00
|
|
|
{offsetof(struct MutableCFOptions, compression_per_level),
|
|
|
|
sizeof(std::vector<CompressionType>)},
|
Use -Wno-invalid-offsetof instead of dangerous offset_of hack (#9563)
Summary:
After https://github.com/facebook/rocksdb/issues/9515 added a unique_ptr to Status, we see some
warnings-as-error in some internal builds like this:
```
stderr: rocksdb/src/db/compaction/compaction_job.cc:2839:7: error:
offset of on non-standard-layout type 'struct CompactionServiceResult'
[-Werror,-Winvalid-offsetof]
{offsetof(struct CompactionServiceResult, status),
^ ~~~~~~
```
I see three potential solutions to resolving this:
* Expand our use of an idiom that works around the warning (see offset_of
functions removed in this change, inspired by
https://gist.github.com/graphitemaster/494f21190bb2c63c5516). However,
this construction is invoking undefined behavior that assumes consistent
layout with no compiler-introduced indirection. A compiler incompatible
with our assumptions will likely compile the code and exhibit undefined
behavior.
* Migrate to something in place of offset, like a function mapping
CompactionServiceResult* to Status* (for the `status` field). This might
be required in the long term.
* **Selected:** Use our new C++17 dependency to use offsetof in a well-defined way
when the compiler allows it. From a comment on
https://gist.github.com/graphitemaster/494f21190bb2c63c5516:
> A final note: in C++17, offsetof is conditionally supported, which
> means that you can use it on any type (not just standard layout
> types) and the compiler will error if it can't compile it correctly.
> That appears to be the best option if you can live with C++17 and
> don't need constexpr support.
The C++17 semantics are confirmed on
https://en.cppreference.com/w/cpp/types/offsetof, so we can suppress the
warning as long as we accept that we might run into a compiler that
rejects the code, and at that point we will find a solution, such as
the more intrusive "migrate" solution above.
Although this is currently only showing in our buck build, it will
surely show up also with make and cmake, so I have updated those
configurations as well.
Also in the buck build, -Wno-expansion-to-defined does not appear to be
needed anymore (both current compiler configurations) so I
removed it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9563
Test Plan: Tried out buck builds with both current compiler configurations
Reviewed By: riversand963
Differential Revision: D34220931
Pulled By: pdillinger
fbshipit-source-id: d39436008259bd1eaaa87c77be69fb2a5b559e1f
2022-02-15 17:18:08 +00:00
|
|
|
{offsetof(struct MutableCFOptions, max_file_size),
|
2020-04-08 21:37:01 +00:00
|
|
|
sizeof(std::vector<uint64_t>)},
|
|
|
|
};
|
|
|
|
|
2020-04-09 18:20:33 +00:00
|
|
|
// For all memory used for options, pre-fill every char. Otherwise, the
|
|
|
|
// padding bytes might be different so that byte-wise comparison doesn't
|
|
|
|
// generate equal results even if objects are equal.
|
|
|
|
const char kMySpecialChar = 'x';
|
2020-04-08 21:37:01 +00:00
|
|
|
char* mcfo1_ptr = new char[sizeof(MutableCFOptions)];
|
|
|
|
FillWithSpecialChar(mcfo1_ptr, sizeof(MutableCFOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kMutableCFOptionsExcluded, kMySpecialChar);
|
2020-04-08 21:37:01 +00:00
|
|
|
char* mcfo2_ptr = new char[sizeof(MutableCFOptions)];
|
|
|
|
FillWithSpecialChar(mcfo2_ptr, sizeof(MutableCFOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kMutableCFOptionsExcluded, kMySpecialChar);
|
2020-04-08 21:37:01 +00:00
|
|
|
|
2020-04-09 18:20:33 +00:00
|
|
|
// A clean column family options is constructed after filling the same special
|
|
|
|
// char as the initial one, so that the padding bytes are the same.
|
|
|
|
char* cfo_clean_ptr = new char[sizeof(ColumnFamilyOptions)];
|
|
|
|
FillWithSpecialChar(cfo_clean_ptr, sizeof(ColumnFamilyOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kColumnFamilyOptionsExcluded);
|
2020-04-08 21:37:01 +00:00
|
|
|
rnd_filled_options.num_levels = 66;
|
2020-04-09 18:20:33 +00:00
|
|
|
ColumnFamilyOptions* cfo_clean = new (cfo_clean_ptr) ColumnFamilyOptions();
|
|
|
|
|
|
|
|
MutableCFOptions* mcfo1 =
|
|
|
|
new (mcfo1_ptr) MutableCFOptions(rnd_filled_options);
|
|
|
|
ColumnFamilyOptions cfo_back = BuildColumnFamilyOptions(*cfo_clean, *mcfo1);
|
|
|
|
MutableCFOptions* mcfo2 = new (mcfo2_ptr) MutableCFOptions(cfo_back);
|
2020-04-08 21:37:01 +00:00
|
|
|
|
|
|
|
ASSERT_TRUE(CompareBytes(mcfo1_ptr, mcfo2_ptr, sizeof(MutableCFOptions),
|
2020-06-19 22:26:05 +00:00
|
|
|
kMutableCFOptionsExcluded));
|
2020-04-09 18:20:33 +00:00
|
|
|
|
|
|
|
cfo_clean->~ColumnFamilyOptions();
|
|
|
|
mcfo1->~MutableCFOptions();
|
|
|
|
mcfo2->~MutableCFOptions();
|
|
|
|
delete[] mcfo1_ptr;
|
|
|
|
delete[] mcfo2_ptr;
|
|
|
|
delete[] cfo_clean_ptr;
|
2016-04-11 18:39:51 +00:00
|
|
|
}
|
2023-08-09 22:46:44 +00:00
|
|
|
#endif // !ROCKSDB_ASSERT_STATUS_CHECKED
|
2022-04-19 03:26:37 +00:00
|
|
|
#endif // !ROCKSDB_UBSAN_RUN
|
2016-04-11 18:39:51 +00:00
|
|
|
#endif // !__clang__
|
2017-04-27 19:19:55 +00:00
|
|
|
#endif // OS_LINUX || OS_WIN
|
2016-04-11 18:39:51 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2016-04-11 18:39:51 +00:00
|
|
|
|
|
|
|
// Entry point of the test binary: installs the stack trace handler,
// initializes googletest, optionally parses gflags, then runs every
// registered test and returns the googletest result as the exit status.
int main(int argc, char** argv) {
|
2022-10-18 07:35:35 +00:00
|
|
|
// Called before anything else in main.
// NOTE(review): presumably this makes a crashing test print a stack trace --
// confirm against the port layer.
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
2016-04-11 18:39:51 +00:00
|
|
|
// Hand the command line to googletest first; argc is passed by pointer so
// googletest can adjust it.
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
// Flag parsing is only compiled in when GFLAGS is defined. Without it,
// FLAGS_enable_print is the plain `bool` defined at the top of this file and
// stays at its initial value of false.
#ifdef GFLAGS
|
|
|
|
// Parses flags declared via DEFINE_bool (enable_print) at the top of this file.
// NOTE(review): the third argument `true` is presumably gflags'
// remove_flags option -- confirm against the gflags documentation.
ParseCommandLineFlags(&argc, &argv, true);
|
|
|
|
#endif // GFLAGS
|
|
|
|
// The value produced by RUN_ALL_TESTS() becomes the process exit status.
return RUN_ALL_TESTS();
|
|
|
|
}
|