2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
2014-02-05 00:21:47 +00:00
|
|
|
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
#include "rocksdb/table.h"
|
|
|
|
|
|
|
|
#include <gtest/gtest.h>
|
2021-08-12 02:31:44 +00:00
|
|
|
|
2014-01-24 19:09:04 +00:00
|
|
|
#include <algorithm>
|
2023-12-01 19:15:17 +00:00
|
|
|
#include <cstddef>
|
|
|
|
#include <cstdio>
|
2014-06-13 02:03:22 +00:00
|
|
|
#include <iostream>
|
2011-03-18 22:37:00 +00:00
|
|
|
#include <map>
|
2013-07-23 21:42:27 +00:00
|
|
|
#include <memory>
|
2015-09-02 20:58:22 +00:00
|
|
|
#include <string>
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
#include <unordered_set>
|
2013-10-10 18:43:24 +00:00
|
|
|
#include <vector>
|
|
|
|
|
2017-05-06 03:10:56 +00:00
|
|
|
#include "cache/lru_cache.h"
|
2022-05-17 22:01:51 +00:00
|
|
|
#include "db/db_test_util.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db/memtable.h"
|
|
|
|
#include "db/write_batch_internal.h"
|
2015-10-16 21:10:33 +00:00
|
|
|
#include "memtable/stl_wrappers.h"
|
2023-05-17 18:27:09 +00:00
|
|
|
#include "monitoring/statistics_impl.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
#include "options/options_helper.h"
|
2017-02-06 22:43:55 +00:00
|
|
|
#include "port/port.h"
|
2021-12-10 16:12:09 +00:00
|
|
|
#include "port/stack_trace.h"
|
2013-11-13 06:46:51 +00:00
|
|
|
#include "rocksdb/cache.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
#include "rocksdb/compression_type.h"
|
Hide deprecated, inefficient block-based filter from public API (#9535)
Summary:
This change removes the ability to configure the deprecated,
inefficient block-based filter in the public API. Options that would
have enabled it now use "full" (and optionally partitioned) filters.
Existing block-based filters can still be read and used, and a "back
door" way to build them still exists, for testing and in case of trouble.
About the only way this removal would cause an issue for users is if
temporary memory for filter construction greatly increases. In
HISTORY.md we suggest a few possible mitigations: partitioned filters,
smaller SST files, or setting reserve_table_builder_memory=true.
Or users who have customized a FilterPolicy using the
CreateFilter/KeyMayMatch mechanism removed in https://github.com/facebook/rocksdb/issues/9501 will have to upgrade
their code. (It's long past time for people to move to the new
builder/reader customization interface.)
This change also introduces some internal-use-only configuration strings
for testing specific filter implementations while bypassing some
compatibility / intelligence logic. This is intended to hint at a path
toward making FilterPolicy Customizable, but it also gives us a "back
door" way to configure block-based filter.
Aside: updated db_bench so that -readonly implies -use_existing_db
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9535
Test Plan:
Unit tests updated. Specifically,
* BlockBasedTableTest.BlockReadCountTest is tweaked to validate the back
door configuration interface and ignoring of `use_block_based_builder`.
* BlockBasedTableTest.TracingGetTest is migrated from testing
block-based filter access pattern to full filter access patter, by
re-ordering some things.
* Options test (pretty self-explanatory)
Performance test - create with `./db_bench -db=/dev/shm/rocksdb1 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0` with and without `-use_block_based_filter`, which creates a DB with 21 SST files in L0. Read with `./db_bench -db=/dev/shm/rocksdb1 -readonly -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=30`
Without -use_block_based_filter: readrandom 464 ops/sec, 689280 KB DB
With -use_block_based_filter: readrandom 169 ops/sec, 690996 KB DB
No consistent difference with fillrandom
Reviewed By: jay-zhuang
Differential Revision: D34153871
Pulled By: pdillinger
fbshipit-source-id: 31f4a933c542f8f09aca47fa64aec67832a69738
2022-02-12 15:04:09 +00:00
|
|
|
#include "rocksdb/convenience.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
2020-02-10 23:42:46 +00:00
|
|
|
#include "rocksdb/file_checksum.h"
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
#include "rocksdb/file_system.h"
|
2021-09-29 11:01:57 +00:00
|
|
|
#include "rocksdb/filter_policy.h"
|
2013-08-23 15:38:13 +00:00
|
|
|
#include "rocksdb/iterator.h"
|
|
|
|
#include "rocksdb/memtablerep.h"
|
2022-07-24 00:38:49 +00:00
|
|
|
#include "rocksdb/options.h"
|
2015-09-02 22:36:47 +00:00
|
|
|
#include "rocksdb/perf_context.h"
|
2014-03-01 02:19:07 +00:00
|
|
|
#include "rocksdb/slice_transform.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
#include "rocksdb/table_properties.h"
|
2021-08-12 02:31:44 +00:00
|
|
|
#include "rocksdb/trace_record.h"
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
#include "rocksdb/unique_id.h"
|
2016-06-21 01:01:03 +00:00
|
|
|
#include "rocksdb/write_buffer_manager.h"
|
2019-05-30 21:47:29 +00:00
|
|
|
#include "table/block_based/block.h"
|
|
|
|
#include "table/block_based/block_based_table_builder.h"
|
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
2023-12-06 21:48:15 +00:00
|
|
|
#include "table/block_based/block_based_table_iterator.h"
|
2019-05-30 21:47:29 +00:00
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
|
|
#include "table/block_based/block_builder.h"
|
Hide deprecated, inefficient block-based filter from public API (#9535)
Summary:
This change removes the ability to configure the deprecated,
inefficient block-based filter in the public API. Options that would
have enabled it now use "full" (and optionally partitioned) filters.
Existing block-based filters can still be read and used, and a "back
door" way to build them still exists, for testing and in case of trouble.
About the only way this removal would cause an issue for users is if
temporary memory for filter construction greatly increases. In
HISTORY.md we suggest a few possible mitigations: partitioned filters,
smaller SST files, or setting reserve_table_builder_memory=true.
Or users who have customized a FilterPolicy using the
CreateFilter/KeyMayMatch mechanism removed in https://github.com/facebook/rocksdb/issues/9501 will have to upgrade
their code. (It's long past time for people to move to the new
builder/reader customization interface.)
This change also introduces some internal-use-only configuration strings
for testing specific filter implementations while bypassing some
compatibility / intelligence logic. This is intended to hint at a path
toward making FilterPolicy Customizable, but it also gives us a "back
door" way to configure block-based filter.
Aside: updated db_bench so that -readonly implies -use_existing_db
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9535
Test Plan:
Unit tests updated. Specifically,
* BlockBasedTableTest.BlockReadCountTest is tweaked to validate the back
door configuration interface and ignoring of `use_block_based_builder`.
* BlockBasedTableTest.TracingGetTest is migrated from testing
block-based filter access pattern to full filter access patter, by
re-ordering some things.
* Options test (pretty self-explanatory)
Performance test - create with `./db_bench -db=/dev/shm/rocksdb1 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0` with and without `-use_block_based_filter`, which creates a DB with 21 SST files in L0. Read with `./db_bench -db=/dev/shm/rocksdb1 -readonly -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=30`
Without -use_block_based_filter: readrandom 464 ops/sec, 689280 KB DB
With -use_block_based_filter: readrandom 169 ops/sec, 690996 KB DB
No consistent difference with fillrandom
Reviewed By: jay-zhuang
Differential Revision: D34153871
Pulled By: pdillinger
fbshipit-source-id: 31f4a933c542f8f09aca47fa64aec67832a69738
2022-02-12 15:04:09 +00:00
|
|
|
#include "table/block_based/filter_policy_internal.h"
|
2023-05-17 18:27:09 +00:00
|
|
|
#include "table/block_based/flush_block_policy_impl.h"
|
2021-12-10 16:12:09 +00:00
|
|
|
#include "table/block_fetcher.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
#include "table/format.h"
|
2015-09-02 20:58:22 +00:00
|
|
|
#include "table/get_context.h"
|
2015-10-12 22:06:38 +00:00
|
|
|
#include "table/internal_iterator.h"
|
2021-12-10 16:12:09 +00:00
|
|
|
#include "table/meta_blocks.h"
|
2019-05-30 21:47:29 +00:00
|
|
|
#include "table/plain/plain_table_factory.h"
|
2016-10-18 23:59:37 +00:00
|
|
|
#include "table/sst_file_writer_collectors.h"
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
#include "table/unique_id_impl.h"
|
2019-05-30 18:21:38 +00:00
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "test_util/testutil.h"
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
#include "util/coding_lean.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "util/compression.h"
|
2020-02-10 23:42:46 +00:00
|
|
|
#include "util/file_checksum_helper.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "util/random.h"
|
|
|
|
#include "util/string_util.h"
|
2021-12-17 12:19:34 +00:00
|
|
|
#include "utilities/memory_allocators.h"
|
2016-04-21 17:16:28 +00:00
|
|
|
#include "utilities/merge_operators.h"
|
2015-03-03 01:07:03 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-11-08 05:27:21 +00:00
|
|
|
namespace {
|
2014-01-24 19:09:04 +00:00
|
|
|
|
2019-07-17 20:02:00 +00:00
|
|
|
const std::string kDummyValue(10000, 'o');
|
|
|
|
|
2016-04-21 17:16:28 +00:00
|
|
|
// DummyPropertiesCollector used to test BlockBasedTableProperties
|
|
|
|
class DummyPropertiesCollector : public TablePropertiesCollector {
|
|
|
|
public:
|
2020-10-01 22:21:17 +00:00
|
|
|
const char* Name() const override { return "DummyPropertiesCollector"; }
|
2016-04-21 17:16:28 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
Status Finish(UserCollectedProperties* /*properties*/) override {
|
2018-03-05 21:08:17 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
2016-04-21 17:16:28 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
Status Add(const Slice& /*user_key*/, const Slice& /*value*/) override {
|
2018-03-05 21:08:17 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
2016-04-21 17:16:28 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
UserCollectedProperties GetReadableProperties() const override {
|
2016-04-21 17:16:28 +00:00
|
|
|
return UserCollectedProperties{};
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
class DummyPropertiesCollectorFactory1
|
|
|
|
: public TablePropertiesCollectorFactory {
|
|
|
|
public:
|
2019-02-14 21:52:47 +00:00
|
|
|
TablePropertiesCollector* CreateTablePropertiesCollector(
|
|
|
|
TablePropertiesCollectorFactory::Context /*context*/) override {
|
2016-04-21 17:16:28 +00:00
|
|
|
return new DummyPropertiesCollector();
|
|
|
|
}
|
2020-10-01 22:21:17 +00:00
|
|
|
const char* Name() const override {
|
|
|
|
return "DummyPropertiesCollectorFactory1";
|
|
|
|
}
|
2016-04-21 17:16:28 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
class DummyPropertiesCollectorFactory2
|
|
|
|
: public TablePropertiesCollectorFactory {
|
|
|
|
public:
|
2019-02-14 21:52:47 +00:00
|
|
|
TablePropertiesCollector* CreateTablePropertiesCollector(
|
|
|
|
TablePropertiesCollectorFactory::Context /*context*/) override {
|
2016-04-21 17:16:28 +00:00
|
|
|
return new DummyPropertiesCollector();
|
|
|
|
}
|
2020-10-01 22:21:17 +00:00
|
|
|
const char* Name() const override {
|
|
|
|
return "DummyPropertiesCollectorFactory2";
|
|
|
|
}
|
2016-04-21 17:16:28 +00:00
|
|
|
};
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
// Return reverse of "key".
|
|
|
|
// Used to test non-lexicographic comparators.
|
2014-01-24 19:09:04 +00:00
|
|
|
std::string Reverse(const Slice& key) {
|
|
|
|
auto rev = key.ToString();
|
|
|
|
std::reverse(rev.begin(), rev.end());
|
2011-03-18 22:37:00 +00:00
|
|
|
return rev;
|
|
|
|
}
|
|
|
|
|
|
|
|
class ReverseKeyComparator : public Comparator {
|
|
|
|
public:
|
2019-02-14 21:52:47 +00:00
|
|
|
const char* Name() const override {
|
2013-10-05 05:32:05 +00:00
|
|
|
return "rocksdb.ReverseBytewiseComparator";
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
int Compare(const Slice& a, const Slice& b) const override {
|
2011-03-18 22:37:00 +00:00
|
|
|
return BytewiseComparator()->Compare(Reverse(a), Reverse(b));
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void FindShortestSeparator(std::string* start,
|
|
|
|
const Slice& limit) const override {
|
2011-03-18 22:37:00 +00:00
|
|
|
std::string s = Reverse(*start);
|
|
|
|
std::string l = Reverse(limit);
|
|
|
|
BytewiseComparator()->FindShortestSeparator(&s, l);
|
|
|
|
*start = Reverse(s);
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void FindShortSuccessor(std::string* key) const override {
|
2011-03-18 22:37:00 +00:00
|
|
|
std::string s = Reverse(*key);
|
|
|
|
BytewiseComparator()->FindShortSuccessor(&s);
|
|
|
|
*key = Reverse(s);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2014-01-24 19:09:04 +00:00
|
|
|
ReverseKeyComparator reverse_key_comparator;
|
|
|
|
|
|
|
|
void Increment(const Comparator* cmp, std::string* key) {
|
2011-03-18 22:37:00 +00:00
|
|
|
if (cmp == BytewiseComparator()) {
|
|
|
|
key->push_back('\0');
|
|
|
|
} else {
|
|
|
|
assert(cmp == &reverse_key_comparator);
|
|
|
|
std::string rev = Reverse(*key);
|
|
|
|
rev.push_back('\0');
|
|
|
|
*key = Reverse(rev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-29 13:59:53 +00:00
|
|
|
const auto kUnknownColumnFamily =
|
|
|
|
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
|
|
|
|
|
2011-10-31 17:22:06 +00:00
|
|
|
} // namespace
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Helper class for tests to unify the interface between
|
|
|
|
// BlockBuilder/TableBuilder and Block/Table.
|
|
|
|
class Constructor {
|
|
|
|
public:
|
2015-09-02 20:58:22 +00:00
|
|
|
explicit Constructor(const Comparator* cmp)
|
|
|
|
: data_(stl_wrappers::LessOfComparator(cmp)) {}
|
2023-12-01 19:15:17 +00:00
|
|
|
virtual ~Constructor() = default;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
void Add(const std::string& key, const Slice& value) {
|
|
|
|
data_[key] = value.ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finish constructing the data structure with all the keys that have
|
|
|
|
// been added so far. Returns the keys in sorted order in "*keys"
|
|
|
|
// and stores the key/value pairs in "*kvmap"
|
2021-05-05 20:59:21 +00:00
|
|
|
void Finish(const Options& options, const ImmutableOptions& ioptions,
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions& moptions,
|
2014-08-25 21:22:05 +00:00
|
|
|
const BlockBasedTableOptions& table_options,
|
2014-01-27 21:53:22 +00:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
2015-09-02 20:58:22 +00:00
|
|
|
std::vector<std::string>* keys, stl_wrappers::KVMap* kvmap) {
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-17 01:13:55 +00:00
|
|
|
last_internal_comparator_ = &internal_comparator;
|
2011-03-18 22:37:00 +00:00
|
|
|
*kvmap = data_;
|
|
|
|
keys->clear();
|
2015-09-02 20:58:22 +00:00
|
|
|
for (const auto& kv : data_) {
|
|
|
|
keys->push_back(kv.first);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
data_.clear();
|
2018-05-21 21:33:55 +00:00
|
|
|
Status s = FinishImpl(options, ioptions, moptions, table_options,
|
2014-09-04 23:18:36 +00:00
|
|
|
internal_comparator, *kvmap);
|
2011-03-18 22:37:00 +00:00
|
|
|
ASSERT_TRUE(s.ok()) << s.ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Construct the data structure from the data in "data"
|
2014-01-27 21:53:22 +00:00
|
|
|
virtual Status FinishImpl(const Options& options,
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions& ioptions,
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions& moptions,
|
2014-08-25 21:22:05 +00:00
|
|
|
const BlockBasedTableOptions& table_options,
|
2014-01-27 21:53:22 +00:00
|
|
|
const InternalKeyComparator& internal_comparator,
|
2015-09-02 20:58:22 +00:00
|
|
|
const stl_wrappers::KVMap& data) = 0;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2018-05-21 21:33:55 +00:00
|
|
|
virtual InternalIterator* NewIterator(
|
|
|
|
const SliceTransform* prefix_extractor = nullptr) const = 0;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2015-09-02 20:58:22 +00:00
|
|
|
virtual const stl_wrappers::KVMap& data() { return data_; }
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2014-09-05 00:40:41 +00:00
|
|
|
virtual bool IsArenaMode() const { return false; }
|
|
|
|
|
2013-03-01 02:04:58 +00:00
|
|
|
virtual DB* db() const { return nullptr; } // Overridden in DBConstructor
|
2011-03-21 19:40:57 +00:00
|
|
|
|
2014-09-05 00:40:41 +00:00
|
|
|
virtual bool AnywayDeleteIterator() const { return false; }
|
|
|
|
|
2014-01-27 21:53:22 +00:00
|
|
|
protected:
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-17 01:13:55 +00:00
|
|
|
const InternalKeyComparator* last_internal_comparator_;
|
2014-01-27 21:53:22 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap data_;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2013-12-20 17:35:24 +00:00
|
|
|
// A helper class that converts internal format keys into user keys
|
2015-10-12 22:06:38 +00:00
|
|
|
class KeyConvertingIterator : public InternalIterator {
|
2011-03-18 22:37:00 +00:00
|
|
|
public:
|
2015-10-12 22:06:38 +00:00
|
|
|
explicit KeyConvertingIterator(InternalIterator* iter,
|
|
|
|
bool arena_mode = false)
|
2014-09-05 00:40:41 +00:00
|
|
|
: iter_(iter), arena_mode_(arena_mode) {}
|
2019-02-14 21:52:47 +00:00
|
|
|
~KeyConvertingIterator() override {
|
2014-09-05 00:40:41 +00:00
|
|
|
if (arena_mode_) {
|
2015-10-12 22:06:38 +00:00
|
|
|
iter_->~InternalIterator();
|
2014-09-05 00:40:41 +00:00
|
|
|
} else {
|
|
|
|
delete iter_;
|
|
|
|
}
|
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
bool Valid() const override { return iter_->Valid() && status_.ok(); }
|
|
|
|
void Seek(const Slice& target) override {
|
2013-12-20 17:35:24 +00:00
|
|
|
ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
iter_->Seek(encoded);
|
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
void SeekForPrev(const Slice& target) override {
|
2016-09-28 01:20:57 +00:00
|
|
|
ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
iter_->SeekForPrev(encoded);
|
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
void SeekToFirst() override { iter_->SeekToFirst(); }
|
|
|
|
void SeekToLast() override { iter_->SeekToLast(); }
|
|
|
|
void Next() override { iter_->Next(); }
|
|
|
|
void Prev() override { iter_->Prev(); }
|
2020-08-05 17:42:56 +00:00
|
|
|
IterBoundCheck UpperBoundCheckResult() override {
|
|
|
|
return iter_->UpperBoundCheckResult();
|
|
|
|
}
|
2013-12-20 17:35:24 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
Slice key() const override {
|
2013-12-20 17:35:24 +00:00
|
|
|
assert(Valid());
|
2014-11-06 19:14:28 +00:00
|
|
|
ParsedInternalKey parsed_key;
|
2020-10-28 17:11:13 +00:00
|
|
|
Status pik_status =
|
|
|
|
ParseInternalKey(iter_->key(), &parsed_key, true /* log_err_key */);
|
|
|
|
if (!pik_status.ok()) {
|
|
|
|
status_ = pik_status;
|
|
|
|
return Slice(status_.getState());
|
2013-12-20 17:35:24 +00:00
|
|
|
}
|
2014-11-06 19:14:28 +00:00
|
|
|
return parsed_key.user_key;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
2014-01-24 18:57:15 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
Slice value() const override { return iter_->value(); }
|
|
|
|
Status status() const override {
|
2013-12-20 17:35:24 +00:00
|
|
|
return status_.ok() ? iter_->status() : status_;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
mutable Status status_;
|
2015-10-12 22:06:38 +00:00
|
|
|
InternalIterator* iter_;
|
2014-09-05 00:40:41 +00:00
|
|
|
bool arena_mode_;
|
2013-12-20 17:35:24 +00:00
|
|
|
|
|
|
|
// No copying allowed
|
2023-12-01 19:15:17 +00:00
|
|
|
KeyConvertingIterator(const KeyConvertingIterator&) = delete;
|
|
|
|
void operator=(const KeyConvertingIterator&) = delete;
|
2013-12-20 17:35:24 +00:00
|
|
|
};
|
|
|
|
|
2020-07-08 00:25:08 +00:00
|
|
|
// `BlockConstructor` APIs always accept/return user keys.
|
|
|
|
class BlockConstructor : public Constructor {
|
|
|
|
public:
|
|
|
|
explicit BlockConstructor(const Comparator* cmp)
|
|
|
|
: Constructor(cmp), comparator_(cmp), block_(nullptr) {}
|
|
|
|
~BlockConstructor() override { delete block_; }
|
|
|
|
Status FinishImpl(const Options& /*options*/,
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions& /*ioptions*/,
|
2020-07-08 00:25:08 +00:00
|
|
|
const MutableCFOptions& /*moptions*/,
|
|
|
|
const BlockBasedTableOptions& table_options,
|
|
|
|
const InternalKeyComparator& /*internal_comparator*/,
|
|
|
|
const stl_wrappers::KVMap& kv_map) override {
|
|
|
|
delete block_;
|
|
|
|
block_ = nullptr;
|
|
|
|
BlockBuilder builder(table_options.block_restart_interval);
|
|
|
|
|
|
|
|
for (const auto& kv : kv_map) {
|
|
|
|
// `DataBlockIter` assumes it reads only internal keys. `BlockConstructor`
|
|
|
|
// clients provide user keys, so we need to convert to internal key format
|
|
|
|
// before writing the data block.
|
|
|
|
ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
builder.Add(encoded, kv.second);
|
|
|
|
}
|
|
|
|
// Open the block
|
|
|
|
data_ = builder.Finish().ToString();
|
|
|
|
BlockContents contents;
|
|
|
|
contents.data = data_;
|
|
|
|
block_ = new Block(std::move(contents));
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
InternalIterator* NewIterator(
|
|
|
|
const SliceTransform* /*prefix_extractor*/) const override {
|
|
|
|
// `DataBlockIter` returns the internal keys it reads.
|
|
|
|
// `KeyConvertingIterator` converts them to user keys before they are
|
|
|
|
// exposed to the `BlockConstructor` clients.
|
|
|
|
return new KeyConvertingIterator(
|
|
|
|
block_->NewDataIterator(comparator_, kDisableGlobalSequenceNumber));
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
const Comparator* comparator_;
|
|
|
|
std::string data_;
|
|
|
|
Block* block_;
|
|
|
|
|
2023-12-01 19:15:17 +00:00
|
|
|
BlockConstructor() = delete;
|
2020-07-08 00:25:08 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
class TableConstructor : public Constructor {
|
2013-12-20 17:35:24 +00:00
|
|
|
public:
|
2014-01-28 18:35:48 +00:00
|
|
|
explicit TableConstructor(const Comparator* cmp,
|
2018-04-09 23:17:15 +00:00
|
|
|
bool convert_to_internal_key = false,
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
int level = -1, SequenceNumber largest_seqno = 0)
|
2014-02-08 00:25:38 +00:00
|
|
|
: Constructor(cmp),
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
largest_seqno_(largest_seqno),
|
2018-04-09 23:17:15 +00:00
|
|
|
convert_to_internal_key_(convert_to_internal_key),
|
2019-07-17 20:02:00 +00:00
|
|
|
level_(level) {
|
2020-02-20 20:07:53 +00:00
|
|
|
env_ = ROCKSDB_NAMESPACE::Env::Default();
|
2019-07-17 20:02:00 +00:00
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
~TableConstructor() override { Reset(); }
|
2014-01-24 18:57:15 +00:00
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
Status FinishImpl(const Options& options, const ImmutableOptions& ioptions,
|
2019-02-14 21:52:47 +00:00
|
|
|
const MutableCFOptions& moptions,
|
|
|
|
const BlockBasedTableOptions& /*table_options*/,
|
|
|
|
const InternalKeyComparator& internal_comparator,
|
|
|
|
const stl_wrappers::KVMap& kv_map) override {
|
2011-03-18 22:37:00 +00:00
|
|
|
Reset();
|
2015-09-16 23:57:43 +00:00
|
|
|
soptions.use_mmap_reads = ioptions.allow_mmap_reads;
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSWritableFile> sink(new test::StringSink());
|
|
|
|
file_writer_.reset(new WritableFileWriter(
|
|
|
|
std::move(sink), "" /* don't care */, FileOptions()));
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder;
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
|
|
|
|
if (largest_seqno_ != 0) {
|
|
|
|
// Pretend that it's an external file written by SstFileWriter.
|
2024-02-02 22:14:43 +00:00
|
|
|
internal_tbl_prop_coll_factories.emplace_back(
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
new SstFileWriterPropertiesCollectorFactory(2 /* version */,
|
|
|
|
0 /* global_seqno*/));
|
|
|
|
}
|
|
|
|
|
2016-04-07 06:10:32 +00:00
|
|
|
std::string column_family_name;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2024-10-17 21:13:20 +00:00
|
|
|
builder.reset(moptions.table_factory->NewTableBuilder(
|
2024-11-01 17:08:35 +00:00
|
|
|
TableBuilderOptions(
|
|
|
|
ioptions, moptions, read_options, write_options,
|
|
|
|
internal_comparator, &internal_tbl_prop_coll_factories,
|
|
|
|
options.compression, options.compression_opts, kUnknownColumnFamily,
|
|
|
|
column_family_name, level_, kUnknownNewestKeyTime),
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-17 23:16:11 +00:00
|
|
|
file_writer_.get()));
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-06-24 23:20:55 +00:00
|
|
|
for (const auto& kv : kv_map) {
|
2013-12-20 17:35:24 +00:00
|
|
|
if (convert_to_internal_key_) {
|
2014-11-06 19:14:28 +00:00
|
|
|
ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
|
2013-12-20 17:35:24 +00:00
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
2014-11-06 19:14:28 +00:00
|
|
|
builder->Add(encoded, kv.second);
|
2013-12-20 17:35:24 +00:00
|
|
|
} else {
|
2014-11-06 19:14:28 +00:00
|
|
|
builder->Add(kv.first, kv.second);
|
2013-12-20 17:35:24 +00:00
|
|
|
}
|
2020-12-24 00:54:05 +00:00
|
|
|
EXPECT_OK(builder->status());
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
2013-12-20 17:35:24 +00:00
|
|
|
Status s = builder->Finish();
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
EXPECT_OK(file_writer_->Flush(IOOptions()));
|
rocksdb: Replace ASSERT* with EXPECT* in functions that does not return void value
Summary:
gtest does not use exceptions to fail a unit test by design, and `ASSERT*`s are implemented using `return`. As a consequence we cannot use `ASSERT*` in a function that does not return `void` value ([[ https://code.google.com/p/googletest/wiki/AdvancedGuide#Assertion_Placement | 1]]), and have to fix our existing code. This diff does this in a generic way, with no manual changes.
In order to detect all existing `ASSERT*` that are used in functions that doesn't return void value, I change the code to generate compile errors for such cases.
In `util/testharness.h` I defined `EXPECT*` assertions, the same way as `ASSERT*`, and redefined `ASSERT*` to return `void`. Then executed:
```lang=bash
% USE_CLANG=1 make all -j55 -k 2> build.log
% perl -naF: -e 'print "-- -number=".$F[1]." ".$F[0]."\n" if /: error:/' \
build.log | xargs -L 1 perl -spi -e 's/ASSERT/EXPECT/g if $. == $number'
% make format
```
After that I reverted back change to `ASSERT*` in `util/testharness.h`. But preserved introduced `EXPECT*`, which is the same as `ASSERT*`. This will be deleted once switched to gtest.
This diff is independent and contains manual changes only in `util/testharness.h`.
Test Plan:
Make sure all tests are passing.
```lang=bash
% USE_CLANG=1 make check
```
Reviewers: igor, lgalanis, sdong, yufei.zhu, rven, meyering
Reviewed By: meyering
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D33333
2015-03-17 03:52:32 +00:00
|
|
|
EXPECT_TRUE(s.ok()) << s.ToString();
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2018-07-20 16:00:33 +00:00
|
|
|
EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize());
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Open the table
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
file_num_ = cur_file_num_++;
|
2021-01-04 23:59:52 +00:00
|
|
|
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-17 01:13:55 +00:00
|
|
|
return Reopen(ioptions, moptions);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
InternalIterator* NewIterator(
|
2018-05-21 21:33:55 +00:00
|
|
|
const SliceTransform* prefix_extractor) const override {
|
2019-06-20 21:28:22 +00:00
|
|
|
InternalIterator* iter = table_reader_->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options_, prefix_extractor, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized);
|
2013-12-20 17:35:24 +00:00
|
|
|
if (convert_to_internal_key_) {
|
|
|
|
return new KeyConvertingIterator(iter);
|
|
|
|
} else {
|
|
|
|
return iter;
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ApproximateOffsetOf(const Slice& key) const {
|
2023-04-21 16:07:18 +00:00
|
|
|
const ReadOptions read_options;
|
2016-08-19 22:10:31 +00:00
|
|
|
if (convert_to_internal_key_) {
|
|
|
|
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
const Slice skey = ikey.Encode();
|
2019-06-20 21:28:22 +00:00
|
|
|
return table_reader_->ApproximateOffsetOf(
|
2023-04-21 16:07:18 +00:00
|
|
|
read_options, skey, TableReaderCaller::kUncategorized);
|
2016-08-19 22:10:31 +00:00
|
|
|
}
|
2019-06-20 21:28:22 +00:00
|
|
|
return table_reader_->ApproximateOffsetOf(
|
2023-04-21 16:07:18 +00:00
|
|
|
read_options, key, TableReaderCaller::kUncategorized);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
virtual Status Reopen(const ImmutableOptions& ioptions,
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions& moptions) {
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
TEST_GetSink()->contents(), file_num_, ioptions.allow_mmap_reads));
|
2021-01-04 23:59:52 +00:00
|
|
|
|
|
|
|
file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
|
2024-10-17 21:13:20 +00:00
|
|
|
return moptions.table_factory->NewTableReader(
|
2022-01-21 19:36:36 +00:00
|
|
|
TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
|
2023-04-25 19:08:23 +00:00
|
|
|
*last_internal_comparator_,
|
|
|
|
0 /* block_protection_bytes_per_key */,
|
|
|
|
/*skip_filters*/ false,
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
/*immortal*/ false, false, level_,
|
New stable, fixed-length cache keys (#9126)
Summary:
This change standardizes on a new 16-byte cache key format for
block cache (incl compressed and secondary) and persistent cache (but
not table cache and row cache).
The goal is a really fast cache key with practically ideal stability and
uniqueness properties without external dependencies (e.g. from FileSystem).
A fixed key size of 16 bytes should enable future optimizations to the
concurrent hash table for block cache, which is a heavy CPU user /
bottleneck, but there appears to be measurable performance improvement
even with no changes to LRUCache.
This change replaces a lot of disjointed and ugly code handling cache
keys with calls to a simple, clean new internal API (cache_key.h).
(Preserving the old cache key logic under an option would be very ugly
and likely negate the performance gain of the new approach. Complete
replacement carries some inherent risk, but I think that's acceptable
with sufficient analysis and testing.)
The scheme for encoding new cache keys is complicated but explained
in cache_key.cc.
Also: EndianSwapValue is moved to math.h to be next to other bit
operations. (Explains some new include "math.h".) ReverseBits operation
added and unit tests added to hash_test for both.
Fixes https://github.com/facebook/rocksdb/issues/7405 (presuming a root cause)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9126
Test Plan:
### Basic correctness
Several tests needed updates to work with the new functionality, mostly
because we are no longer relying on filesystem for stable cache keys
so table builders & readers need more context info to agree on cache
keys. This functionality is so core, a huge number of existing tests
exercise the cache key functionality.
### Performance
Create db with
`TEST_TMPDIR=/dev/shm ./db_bench -bloom_bits=10 -benchmarks=fillrandom -num=3000000 -partition_index_and_filters`
And test performance with
`TEST_TMPDIR=/dev/shm ./db_bench -readonly -use_existing_db -bloom_bits=10 -benchmarks=readrandom -num=3000000 -duration=30 -cache_index_and_filter_blocks -cache_size=250000 -threads=4`
using DEBUG_LEVEL=0 and simultaneous before & after runs.
Before ops/sec, avg over 100 runs: 121924
After ops/sec, avg over 100 runs: 125385 (+2.8%)
### Collision probability
I have built a tool, ./cache_bench -stress_cache_key to broadly simulate host-wide cache activity
over many months, by making some pessimistic simplifying assumptions:
* Every generated file has a cache entry for every byte offset in the file (contiguous range of cache keys)
* All of every file is cached for its entire lifetime
We use a simple table with skewed address assignment and replacement on address collision
to simulate files coming & going, with quite a variance (super-Poisson) in ages. Some output
with `./cache_bench -stress_cache_key -sck_keep_bits=40`:
```
Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
Multiply by 9.22337e+18 to correct for simulation losses (but still assume whole file cached)
```
These come from default settings of 2.5M files per day of 32 MB each, and
`-sck_keep_bits=40` means that to represent a single file, we are only keeping 40 bits of
the 128-bit cache key. With file size of 2\*\*25 contiguous keys (pessimistic), our simulation
is about 2\*\*(128-40-25) or about 9 billion billion times more prone to collision than reality.
More default assumptions, relatively pessimistic:
* 100 DBs in same process (doesn't matter much)
* Re-open DB in same process (new session ID related to old session ID) on average
every 100 files generated
* Restart process (all new session IDs unrelated to old) 24 times per day
After enough data, we get a result at the end:
```
(keep 40 bits) 17 collisions after 2 x 90 days, est 10.5882 days between (9.76592e+19 corrected)
```
If we believe the (pessimistic) simulation and the mathematical generalization, we would need to run a billion machines all for 97 billion days to expect a cache key collision. To help verify that our generalization ("corrected") is robust, we can make our simulation more precise with `-sck_keep_bits=41` and `42`, which takes more running time to get enough data:
```
(keep 41 bits) 16 collisions after 4 x 90 days, est 22.5 days between (1.03763e+20 corrected)
(keep 42 bits) 19 collisions after 10 x 90 days, est 47.3684 days between (1.09224e+20 corrected)
```
The generalized prediction still holds. With the `-sck_randomize` option, we can see that we are beating "random" cache keys (except offsets still non-randomized) by a modest amount (roughly 20x less collision prone than random), which should make us reasonably comfortable even in "degenerate" cases:
```
197 collisions after 1 x 90 days, est 0.456853 days between (4.21372e+18 corrected)
```
I've run other tests to validate other conditions behave as expected, never behaving "worse than random" unless we start chopping off structured data.
Reviewed By: zhichao-cao
Differential Revision: D33171746
Pulled By: pdillinger
fbshipit-source-id: f16a57e369ed37be5e7e33525ace848d0537c88f
2021-12-17 01:13:55 +00:00
|
|
|
&block_cache_tracer_, moptions.write_buffer_size, "",
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
file_num_, kNullUniqueId64x2, largest_seqno_),
|
2018-07-20 21:34:07 +00:00
|
|
|
std::move(file_reader_), TEST_GetSink()->contents().size(),
|
|
|
|
&table_reader_);
|
2013-01-31 23:20:24 +00:00
|
|
|
}
|
|
|
|
|
2018-07-20 21:34:07 +00:00
|
|
|
virtual TableReader* GetTableReader() { return table_reader_.get(); }
|
2013-01-31 23:20:24 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
bool AnywayDeleteIterator() const override {
|
2014-09-05 00:40:41 +00:00
|
|
|
return convert_to_internal_key_;
|
|
|
|
}
|
|
|
|
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
void ResetTableReader() { table_reader_.reset(); }
|
|
|
|
|
2016-08-19 22:10:31 +00:00
|
|
|
bool ConvertToInternalKey() { return convert_to_internal_key_; }
|
|
|
|
|
2018-07-20 16:00:33 +00:00
|
|
|
test::StringSink* TEST_GetSink() {
|
2021-01-04 23:59:52 +00:00
|
|
|
return static_cast<test::StringSink*>(file_writer_->writable_file());
|
2018-07-20 16:00:33 +00:00
|
|
|
}
|
|
|
|
|
2019-07-17 20:02:00 +00:00
|
|
|
BlockCacheTracer block_cache_tracer_;
|
2023-12-06 21:48:15 +00:00
|
|
|
Env* env_;
|
2019-07-17 20:02:00 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
|
|
|
void Reset() {
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
file_num_ = 0;
|
2013-10-30 17:52:33 +00:00
|
|
|
table_reader_.reset();
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-17 23:16:11 +00:00
|
|
|
file_writer_.reset();
|
|
|
|
file_reader_.reset();
|
|
|
|
}
|
|
|
|
|
2020-08-03 22:21:56 +00:00
|
|
|
const ReadOptions read_options_;
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
uint64_t file_num_;
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<WritableFileWriter> file_writer_;
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader_;
|
|
|
|
std::unique_ptr<TableReader> table_reader_;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
SequenceNumber largest_seqno_;
|
2014-09-05 00:40:41 +00:00
|
|
|
bool convert_to_internal_key_;
|
2018-04-09 23:17:15 +00:00
|
|
|
int level_;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2023-12-01 19:15:17 +00:00
|
|
|
TableConstructor() = delete;
|
2013-01-31 23:20:24 +00:00
|
|
|
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
static uint64_t cur_file_num_;
|
2015-09-16 23:57:43 +00:00
|
|
|
EnvOptions soptions;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
uint64_t TableConstructor::cur_file_num_ = 1;
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2022-10-25 18:50:38 +00:00
|
|
|
class MemTableConstructor : public Constructor {
|
2011-03-18 22:37:00 +00:00
|
|
|
public:
|
2016-06-21 01:01:03 +00:00
|
|
|
explicit MemTableConstructor(const Comparator* cmp, WriteBufferManager* wb)
|
2011-03-18 22:37:00 +00:00
|
|
|
: Constructor(cmp),
|
2013-07-23 21:42:27 +00:00
|
|
|
internal_comparator_(cmp),
|
2016-06-21 01:01:03 +00:00
|
|
|
write_buffer_manager_(wb),
|
2013-07-23 21:42:27 +00:00
|
|
|
table_factory_(new SkipListFactory) {
|
2014-12-02 20:09:20 +00:00
|
|
|
options_.memtable_factory = table_factory_;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options_);
|
2016-09-14 04:11:59 +00:00
|
|
|
memtable_ =
|
|
|
|
new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_),
|
2017-06-02 19:08:01 +00:00
|
|
|
wb, kMaxSequenceNumber, 0 /* column_family_id */);
|
2011-05-21 02:17:43 +00:00
|
|
|
memtable_->Ref();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
~MemTableConstructor() override { delete memtable_->Unref(); }
|
2021-05-05 20:59:21 +00:00
|
|
|
Status FinishImpl(const Options&, const ImmutableOptions& ioptions,
|
2019-02-14 21:52:47 +00:00
|
|
|
const MutableCFOptions& /*moptions*/,
|
|
|
|
const BlockBasedTableOptions& /*table_options*/,
|
|
|
|
const InternalKeyComparator& /*internal_comparator*/,
|
|
|
|
const stl_wrappers::KVMap& kv_map) override {
|
2013-12-02 05:23:44 +00:00
|
|
|
delete memtable_->Unref();
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions mem_ioptions(ioptions);
|
2014-10-01 23:19:16 +00:00
|
|
|
memtable_ = new MemTable(internal_comparator_, mem_ioptions,
|
2016-09-14 04:11:59 +00:00
|
|
|
MutableCFOptions(options_), write_buffer_manager_,
|
2017-06-02 19:08:01 +00:00
|
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
2011-05-21 02:17:43 +00:00
|
|
|
memtable_->Ref();
|
2011-03-18 22:37:00 +00:00
|
|
|
int seq = 1;
|
2020-06-24 23:20:55 +00:00
|
|
|
for (const auto& kv : kv_map) {
|
Integrity protection for live updates to WriteBatch (#7748)
Summary:
This PR adds the foundation classes for key-value integrity protection and the first use case: protecting live updates from the source buffers added to `WriteBatch` through the destination buffer in `MemTable`. The width of the protection info is not yet configurable -- only eight bytes per key is supported. This PR allows users to enable protection by constructing `WriteBatch` with `protection_bytes_per_key == 8`. It does not yet expose a way for users to get integrity protection via other write APIs (e.g., `Put()`, `Merge()`, `Delete()`, etc.).
The foundation classes (`ProtectionInfo.*`) embed the coverage info in their type, and provide `Protect.*()` and `Strip.*()` functions to navigate between types with different coverage. For making bytes per key configurable (for powers of two up to eight) in the future, these classes are templated on the unsigned integer type used to store the protection info. That integer contains the XOR'd result of hashes with independent seeds for all covered fields. For integer fields, the hash is computed on the raw unadjusted bytes, so the result is endian-dependent. The most significant bytes are truncated when the hash value (8 bytes) is wider than the protection integer.
When `WriteBatch` is constructed with `protection_bytes_per_key == 8`, we hold a `ProtectionInfoKVOTC` (i.e., one that covers key, value, optype aka `ValueType`, timestamp, and CF ID) for each entry added to the batch. The protection info is generated from the original buffers passed by the user, as well as the original metadata generated internally. When writing to memtable, each entry is transformed to a `ProtectionInfoKVOTS` (i.e., dropping coverage of CF ID and adding coverage of sequence number), since at that point we know the sequence number, and have already selected a memtable corresponding to a particular CF. This protection info is verified once the entry is encoded in the `MemTable` buffer.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7748
Test Plan:
- an integration test to verify a wide variety of single-byte changes to the encoded `MemTable` buffer are caught
- add to stress/crash test to verify it works in variety of configs/operations without intentional corruption
- [deferred] unit tests for `ProtectionInfo.*` classes for edge cases like KV swap, `SliceParts` and `Slice` APIs are interchangeable, etc.
Reviewed By: pdillinger
Differential Revision: D25754492
Pulled By: ajkr
fbshipit-source-id: e481bac6c03c2ab268be41359730f1ceb9964866
2021-01-29 20:17:17 +00:00
|
|
|
Status s = memtable_->Add(seq, kTypeValue, kv.first, kv.second,
|
|
|
|
nullptr /* kv_prot_info */);
|
2020-11-24 00:27:46 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
seq++;
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
InternalIterator* NewIterator(
|
2018-05-21 21:33:55 +00:00
|
|
|
const SliceTransform* /*prefix_extractor*/) const override {
|
2014-09-05 00:40:41 +00:00
|
|
|
return new KeyConvertingIterator(
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
memtable_->NewIterator(ReadOptions(), /*seqno_to_time_mapping=*/nullptr,
|
Steps toward deprecating implicit prefix seek, related fixes (#13026)
Summary:
With some new use cases onboarding to prefix extractors/seek/filters, one of the risks is existing iterator code, e.g. for maintenance tasks, being unintentionally subject to prefix seek semantics. This is a longstanding known design flaw with prefix seek, and `prefix_same_as_start` and `auto_prefix_mode` were steps in the direction of making that obsolete. However, we can't just immediately set `total_order_seek` to true by default, because that would impact so much code instantly.
Here we add a new DB option, `prefix_seek_opt_in_only` that basically allows users to transition to the future behavior when they are ready. When set to true, all iterators will be treated as if `total_order_seek=true` and then the only ways to get prefix seek semantics are with `prefix_same_as_start` or `auto_prefix_mode`.
Related fixes / changes:
* Make sure that `prefix_same_as_start` and `auto_prefix_mode` are compatible with (or override) `total_order_seek` (depending on your interpretation).
* Fix a bug in which a new iterator after dynamically changing the prefix extractor might mix different prefix semantics between memtable and SSTs. Both should use the latest extractor semantics, which means iterators ignoring memtable prefix filters with an old extractor. And that means passing the latest prefix extractor to new memtable iterators that might use prefix seek. (Without the fix, the test added for this fails in many ways.)
Suggested follow-up:
* Investigate a FIXME where a MergeIteratorBuilder is created in db_impl.cc. No unit test detects a change in value that should impact correctness.
* Make memtable prefix bloom compatible with `auto_prefix_mode`, which might require involving the memtablereps because we don't know at iterator creation time (only seek time) whether an auto_prefix_mode seek will be a prefix seek.
* Add `prefix_same_as_start` testing to db_stress
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13026
Test Plan:
tests updated, added. Add combination of `total_order_seek=true` and `auto_prefix_mode=true` to stress test. Ran `make blackbox_crash_test` for a long while.
Manually ran tests with `prefix_seek_opt_in_only=true` as default, looking for unexpected issues. I inspected most of the results and migrated many tests to be ready for such a change (but not all).
Reviewed By: ltamasi
Differential Revision: D63147378
Pulled By: pdillinger
fbshipit-source-id: 1f4477b730683d43b4be7e933338583702d3c25e
2024-09-20 22:54:19 +00:00
|
|
|
&arena_, /*prefix_extractor=*/nullptr),
|
Support returning write unix time in iterator property (#12428)
Summary:
This PR adds support to return data's approximate unix write time in the iterator property API. The general implementation is:
1) If the entry comes from a SST file, the sequence number to time mapping recorded in that file's table properties will be used to deduce the entry's write time from its sequence number. If no such recording is available, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown except if the entry's sequence number is zero, in which case, 0 is returned. This also means that even if `preclude_last_level_data_seconds` and `preserve_internal_time_seconds` can be toggled off between DB reopens, as long as the SST file's table property has the mapping available, the entry's write time can be deduced and returned.
2) If the entry comes from memtable, we will use the DB's sequence number to write time mapping to do similar things. A copy of the DB's seqno to write time mapping is kept in SuperVersion to allow iterators to have lock free access. This also means a new `SuperVersion` is installed each time DB's seqno to time mapping updates, which is originally proposed by Peter in https://github.com/facebook/rocksdb/issues/11928 . Similarly, if the feature is not enabled, `std::numeric_limits<uint64_t>::max()` is returned to indicate the write time is unknown.
Needed follow up:
1) The write time for `kTypeValuePreferredSeqno` should be special cased, where it's already specified by the user, so we can directly return it.
2) Flush job can be updated to use DB's seqno to time mapping copy in the SuperVersion.
3) Handle the case when `TimedPut` is called with a write time that is `std::numeric_limits<uint64_t>::max()`. We can make it a regular `Put`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12428
Test Plan: Added unit test
Reviewed By: pdillinger
Differential Revision: D54967067
Pulled By: jowlyzhang
fbshipit-source-id: c795b1b7ec142e09e53f2ed3461cf719833cb37a
2024-03-15 22:37:37 +00:00
|
|
|
true);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
bool AnywayDeleteIterator() const override { return true; }
|
2014-09-05 00:40:41 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
bool IsArenaMode() const override { return true; }
|
2014-09-05 00:40:41 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
2014-09-05 00:40:41 +00:00
|
|
|
mutable Arena arena_;
|
2011-03-18 22:37:00 +00:00
|
|
|
InternalKeyComparator internal_comparator_;
|
2014-12-02 20:09:20 +00:00
|
|
|
Options options_;
|
2016-06-21 01:01:03 +00:00
|
|
|
WriteBufferManager* write_buffer_manager_;
|
2011-03-18 22:37:00 +00:00
|
|
|
MemTable* memtable_;
|
2013-07-23 21:42:27 +00:00
|
|
|
std::shared_ptr<SkipListFactory> table_factory_;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2015-10-12 22:06:38 +00:00
|
|
|
class InternalIteratorFromIterator : public InternalIterator {
|
|
|
|
public:
|
|
|
|
explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {}
|
2019-02-14 21:52:47 +00:00
|
|
|
bool Valid() const override { return it_->Valid(); }
|
|
|
|
void Seek(const Slice& target) override { it_->Seek(target); }
|
|
|
|
void SeekForPrev(const Slice& target) override { it_->SeekForPrev(target); }
|
|
|
|
void SeekToFirst() override { it_->SeekToFirst(); }
|
|
|
|
void SeekToLast() override { it_->SeekToLast(); }
|
|
|
|
void Next() override { it_->Next(); }
|
|
|
|
void Prev() override { it_->Prev(); }
|
2015-10-12 22:06:38 +00:00
|
|
|
Slice key() const override { return it_->key(); }
|
|
|
|
Slice value() const override { return it_->value(); }
|
2019-02-14 21:52:47 +00:00
|
|
|
Status status() const override { return it_->status(); }
|
2015-10-12 22:06:38 +00:00
|
|
|
|
|
|
|
private:
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<Iterator> it_;
|
2015-10-12 22:06:38 +00:00
|
|
|
};
|
|
|
|
|
2022-10-25 18:50:38 +00:00
|
|
|
class DBConstructor : public Constructor {
|
2011-03-18 22:37:00 +00:00
|
|
|
public:
|
|
|
|
explicit DBConstructor(const Comparator* cmp)
|
2022-10-25 18:50:38 +00:00
|
|
|
: Constructor(cmp), comparator_(cmp) {
|
2013-03-01 02:04:58 +00:00
|
|
|
db_ = nullptr;
|
2011-03-18 22:37:00 +00:00
|
|
|
NewDB();
|
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
~DBConstructor() override { delete db_; }
|
|
|
|
Status FinishImpl(const Options& /*options*/,
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions& /*ioptions*/,
|
2019-02-14 21:52:47 +00:00
|
|
|
const MutableCFOptions& /*moptions*/,
|
|
|
|
const BlockBasedTableOptions& /*table_options*/,
|
|
|
|
const InternalKeyComparator& /*internal_comparator*/,
|
|
|
|
const stl_wrappers::KVMap& kv_map) override {
|
2011-03-18 22:37:00 +00:00
|
|
|
delete db_;
|
2013-03-01 02:04:58 +00:00
|
|
|
db_ = nullptr;
|
2011-03-18 22:37:00 +00:00
|
|
|
NewDB();
|
2020-06-24 23:20:55 +00:00
|
|
|
for (const auto& kv : kv_map) {
|
2011-03-18 22:37:00 +00:00
|
|
|
WriteBatch batch;
|
2020-10-08 18:11:52 +00:00
|
|
|
EXPECT_OK(batch.Put(kv.first, kv.second));
|
rocksdb: Replace ASSERT* with EXPECT* in functions that does not return void value
Summary:
gtest does not use exceptions to fail a unit test by design, and `ASSERT*`s are implemented using `return`. As a consequence we cannot use `ASSERT*` in a function that does not return `void` value ([[ https://code.google.com/p/googletest/wiki/AdvancedGuide#Assertion_Placement | 1]]), and have to fix our existing code. This diff does this in a generic way, with no manual changes.
In order to detect all existing `ASSERT*` that are used in functions that doesn't return void value, I change the code to generate compile errors for such cases.
In `util/testharness.h` I defined `EXPECT*` assertions, the same way as `ASSERT*`, and redefined `ASSERT*` to return `void`. Then executed:
```lang=bash
% USE_CLANG=1 make all -j55 -k 2> build.log
% perl -naF: -e 'print "-- -number=".$F[1]." ".$F[0]."\n" if /: error:/' \
build.log | xargs -L 1 perl -spi -e 's/ASSERT/EXPECT/g if $. == $number'
% make format
```
After that I reverted back change to `ASSERT*` in `util/testharness.h`. But preserved introduced `EXPECT*`, which is the same as `ASSERT*`. This will be deleted once switched to gtest.
This diff is independent and contains manual changes only in `util/testharness.h`.
Test Plan:
Make sure all tests are passing.
```lang=bash
% USE_CLANG=1 make check
```
Reviewers: igor, lgalanis, sdong, yufei.zhu, rven, meyering
Reviewed By: meyering
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D33333
2015-03-17 03:52:32 +00:00
|
|
|
EXPECT_TRUE(db_->Write(WriteOptions(), &batch).ok());
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2015-10-12 22:06:38 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
InternalIterator* NewIterator(
|
2018-05-21 21:33:55 +00:00
|
|
|
const SliceTransform* /*prefix_extractor*/) const override {
|
2015-10-12 22:06:38 +00:00
|
|
|
return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions()));
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
DB* db() const override { return db_; }
|
2011-03-21 19:40:57 +00:00
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
|
|
|
void NewDB() {
|
2018-07-14 00:18:39 +00:00
|
|
|
std::string name = test::PerThreadDBPath("table_testdb");
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-11-20 06:00:48 +00:00
|
|
|
Options options;
|
2011-03-18 22:37:00 +00:00
|
|
|
options.comparator = comparator_;
|
|
|
|
Status status = DestroyDB(name, options);
|
|
|
|
ASSERT_TRUE(status.ok()) << status.ToString();
|
|
|
|
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.error_if_exists = true;
|
2011-03-21 19:40:57 +00:00
|
|
|
options.write_buffer_size = 10000; // Something small to force merging
|
2011-03-18 22:37:00 +00:00
|
|
|
status = DB::Open(options, name, &db_);
|
|
|
|
ASSERT_TRUE(status.ok()) << status.ToString();
|
|
|
|
}
|
|
|
|
|
|
|
|
const Comparator* comparator_;
|
|
|
|
DB* db_;
|
|
|
|
};
|
|
|
|
|
|
|
|
enum TestType {
|
2013-12-20 17:35:24 +00:00
|
|
|
BLOCK_BASED_TABLE_TEST,
|
|
|
|
PLAIN_TABLE_SEMI_FIXED_PREFIX,
|
|
|
|
PLAIN_TABLE_FULL_STR_PREFIX,
|
2014-02-08 00:25:38 +00:00
|
|
|
PLAIN_TABLE_TOTAL_ORDER,
|
2011-03-18 22:37:00 +00:00
|
|
|
BLOCK_TEST,
|
|
|
|
MEMTABLE_TEST,
|
2011-07-19 23:36:47 +00:00
|
|
|
DB_TEST
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
struct TestArgs {
|
|
|
|
TestType type;
|
|
|
|
bool reverse_compare;
|
|
|
|
int restart_interval;
|
2012-06-28 06:41:33 +00:00
|
|
|
CompressionType compression;
|
2020-04-01 23:37:54 +00:00
|
|
|
uint32_t compression_parallel_threads;
|
2015-01-15 00:24:24 +00:00
|
|
|
uint32_t format_version;
|
2015-09-16 23:57:43 +00:00
|
|
|
bool use_mmap;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2020-06-15 21:06:47 +00:00
|
|
|
std::ostream& operator<<(std::ostream& os, const TestArgs& args) {
|
|
|
|
os << "type: " << args.type << " reverse_compare: " << args.reverse_compare
|
|
|
|
<< " restart_interval: " << args.restart_interval
|
|
|
|
<< " compression: " << args.compression
|
|
|
|
<< " compression_parallel_threads: " << args.compression_parallel_threads
|
|
|
|
<< " format_version: " << args.format_version
|
|
|
|
<< " use_mmap: " << args.use_mmap;
|
|
|
|
|
|
|
|
return os;
|
|
|
|
}
|
|
|
|
|
2013-11-20 06:00:48 +00:00
|
|
|
static std::vector<TestArgs> GenerateArgList() {
|
2014-01-24 19:09:04 +00:00
|
|
|
std::vector<TestArgs> test_args;
|
2022-10-25 18:50:38 +00:00
|
|
|
std::vector<TestType> test_types = {BLOCK_BASED_TABLE_TEST,
|
|
|
|
PLAIN_TABLE_SEMI_FIXED_PREFIX,
|
|
|
|
PLAIN_TABLE_FULL_STR_PREFIX,
|
|
|
|
PLAIN_TABLE_TOTAL_ORDER,
|
|
|
|
BLOCK_TEST,
|
|
|
|
MEMTABLE_TEST,
|
|
|
|
DB_TEST};
|
2014-01-24 19:09:04 +00:00
|
|
|
std::vector<bool> reverse_compare_types = {false, true};
|
|
|
|
std::vector<int> restart_intervals = {16, 1, 1024};
|
2020-04-01 23:37:54 +00:00
|
|
|
std::vector<uint32_t> compression_parallel_threads = {1, 4};
|
2012-06-28 06:41:33 +00:00
|
|
|
|
|
|
|
// Only add compression if it is supported
|
2015-01-15 00:24:24 +00:00
|
|
|
std::vector<std::pair<CompressionType, bool>> compression_types;
|
|
|
|
compression_types.emplace_back(kNoCompression, false);
|
2015-04-06 19:50:44 +00:00
|
|
|
if (Snappy_Supported()) {
|
2015-01-15 00:24:24 +00:00
|
|
|
compression_types.emplace_back(kSnappyCompression, false);
|
2014-01-24 19:09:04 +00:00
|
|
|
}
|
2015-04-06 19:50:44 +00:00
|
|
|
if (Zlib_Supported()) {
|
2015-01-15 00:24:24 +00:00
|
|
|
compression_types.emplace_back(kZlibCompression, false);
|
|
|
|
compression_types.emplace_back(kZlibCompression, true);
|
2014-01-24 19:09:04 +00:00
|
|
|
}
|
2015-04-06 19:50:44 +00:00
|
|
|
if (BZip2_Supported()) {
|
2015-01-15 00:24:24 +00:00
|
|
|
compression_types.emplace_back(kBZip2Compression, false);
|
|
|
|
compression_types.emplace_back(kBZip2Compression, true);
|
2014-01-24 19:09:04 +00:00
|
|
|
}
|
2015-04-06 19:50:44 +00:00
|
|
|
if (LZ4_Supported()) {
|
2015-01-15 00:24:24 +00:00
|
|
|
compression_types.emplace_back(kLZ4Compression, false);
|
|
|
|
compression_types.emplace_back(kLZ4Compression, true);
|
|
|
|
compression_types.emplace_back(kLZ4HCCompression, false);
|
|
|
|
compression_types.emplace_back(kLZ4HCCompression, true);
|
2014-02-08 02:12:30 +00:00
|
|
|
}
|
2016-04-20 05:54:24 +00:00
|
|
|
if (XPRESS_Supported()) {
|
|
|
|
compression_types.emplace_back(kXpressCompression, false);
|
|
|
|
compression_types.emplace_back(kXpressCompression, true);
|
|
|
|
}
|
2015-08-27 22:40:42 +00:00
|
|
|
if (ZSTD_Supported()) {
|
2016-09-01 22:28:40 +00:00
|
|
|
compression_types.emplace_back(kZSTD, false);
|
|
|
|
compression_types.emplace_back(kZSTD, true);
|
2015-08-27 22:40:42 +00:00
|
|
|
}
|
2012-06-29 02:26:43 +00:00
|
|
|
|
2014-01-24 19:09:04 +00:00
|
|
|
for (auto test_type : test_types) {
|
|
|
|
for (auto reverse_compare : reverse_compare_types) {
|
|
|
|
if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX ||
|
2015-09-16 23:57:43 +00:00
|
|
|
test_type == PLAIN_TABLE_FULL_STR_PREFIX ||
|
|
|
|
test_type == PLAIN_TABLE_TOTAL_ORDER) {
|
2013-12-20 17:35:24 +00:00
|
|
|
// Plain table doesn't use restart index or compression.
|
|
|
|
TestArgs one_arg;
|
2014-01-24 19:09:04 +00:00
|
|
|
one_arg.type = test_type;
|
|
|
|
one_arg.reverse_compare = reverse_compare;
|
|
|
|
one_arg.restart_interval = restart_intervals[0];
|
2015-01-15 00:24:24 +00:00
|
|
|
one_arg.compression = compression_types[0].first;
|
2020-04-01 23:37:54 +00:00
|
|
|
one_arg.compression_parallel_threads = 1;
|
2020-06-15 21:06:47 +00:00
|
|
|
one_arg.format_version = 0;
|
2015-09-16 23:57:43 +00:00
|
|
|
one_arg.use_mmap = true;
|
|
|
|
test_args.push_back(one_arg);
|
|
|
|
one_arg.use_mmap = false;
|
2014-01-24 19:09:04 +00:00
|
|
|
test_args.push_back(one_arg);
|
2013-12-20 17:35:24 +00:00
|
|
|
continue;
|
|
|
|
}
|
2012-06-28 06:41:33 +00:00
|
|
|
|
2014-01-24 19:09:04 +00:00
|
|
|
for (auto restart_interval : restart_intervals) {
|
|
|
|
for (auto compression_type : compression_types) {
|
2020-04-01 23:37:54 +00:00
|
|
|
for (auto num_threads : compression_parallel_threads) {
|
|
|
|
TestArgs one_arg;
|
|
|
|
one_arg.type = test_type;
|
|
|
|
one_arg.reverse_compare = reverse_compare;
|
|
|
|
one_arg.restart_interval = restart_interval;
|
|
|
|
one_arg.compression = compression_type.first;
|
|
|
|
one_arg.compression_parallel_threads = num_threads;
|
2020-06-15 21:06:47 +00:00
|
|
|
one_arg.format_version = compression_type.second ? 2 : 1;
|
2020-04-01 23:37:54 +00:00
|
|
|
one_arg.use_mmap = false;
|
|
|
|
test_args.push_back(one_arg);
|
|
|
|
}
|
2013-12-20 17:35:24 +00:00
|
|
|
}
|
2014-01-24 19:09:04 +00:00
|
|
|
}
|
2013-12-20 17:35:24 +00:00
|
|
|
}
|
2014-01-24 19:09:04 +00:00
|
|
|
}
|
|
|
|
return test_args;
|
2012-06-28 06:41:33 +00:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2013-12-20 17:35:24 +00:00
|
|
|
// In order to make all tests run for plain table format, including
|
|
|
|
// those operating on empty keys, create a new prefix transformer which
|
|
|
|
// return fixed prefix if the slice is not shorter than the prefix length,
|
|
|
|
// and the full slice if it is shorter.
|
|
|
|
class FixedOrLessPrefixTransform : public SliceTransform {
|
|
|
|
private:
|
|
|
|
const size_t prefix_len_;
|
|
|
|
|
|
|
|
public:
|
2022-10-25 18:50:38 +00:00
|
|
|
explicit FixedOrLessPrefixTransform(size_t prefix_len)
|
|
|
|
: prefix_len_(prefix_len) {}
|
2013-12-20 17:35:24 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
const char* Name() const override { return "rocksdb.FixedPrefix"; }
|
2013-12-20 17:35:24 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
Slice Transform(const Slice& src) const override {
|
2013-12-20 17:35:24 +00:00
|
|
|
assert(InDomain(src));
|
|
|
|
if (src.size() < prefix_len_) {
|
|
|
|
return src;
|
|
|
|
}
|
|
|
|
return Slice(src.data(), prefix_len_);
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
bool InDomain(const Slice& /*src*/) const override { return true; }
|
2013-12-20 17:35:24 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
bool InRange(const Slice& dst) const override {
|
2013-12-20 17:35:24 +00:00
|
|
|
return (dst.size() <= prefix_len_);
|
|
|
|
}
|
2019-02-14 21:52:47 +00:00
|
|
|
bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
|
2013-12-20 17:35:24 +00:00
|
|
|
};
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
class HarnessTest : public testing::Test {
|
2011-03-18 22:37:00 +00:00
|
|
|
public:
|
2020-06-13 00:29:22 +00:00
|
|
|
explicit HarnessTest(const TestArgs& args)
|
|
|
|
: args_(args),
|
|
|
|
ioptions_(options_),
|
2018-05-21 21:33:55 +00:00
|
|
|
moptions_(options_),
|
2020-06-13 00:29:22 +00:00
|
|
|
write_buffer_(options_.db_write_buffer_size),
|
|
|
|
support_prev_(true),
|
|
|
|
only_support_prefix_seek_(false) {
|
|
|
|
options_.compression = args_.compression;
|
2020-04-01 23:37:54 +00:00
|
|
|
options_.compression_opts.parallel_threads =
|
2020-06-13 00:29:22 +00:00
|
|
|
args_.compression_parallel_threads;
|
2011-03-18 22:37:00 +00:00
|
|
|
// Use shorter block size for tests to exercise block boundary
|
|
|
|
// conditions more.
|
2020-06-13 00:29:22 +00:00
|
|
|
if (args_.reverse_compare) {
|
2011-03-18 22:37:00 +00:00
|
|
|
options_.comparator = &reverse_key_comparator;
|
|
|
|
}
|
2014-01-27 21:53:22 +00:00
|
|
|
|
|
|
|
internal_comparator_.reset(
|
|
|
|
new test::PlainInternalKeyComparator(options_.comparator));
|
|
|
|
|
2020-06-13 00:29:22 +00:00
|
|
|
options_.allow_mmap_reads = args_.use_mmap;
|
|
|
|
switch (args_.type) {
|
2013-12-20 17:35:24 +00:00
|
|
|
case BLOCK_BASED_TABLE_TEST:
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options_.flush_block_policy_factory.reset(
|
2014-03-01 00:39:27 +00:00
|
|
|
new FlushBlockBySizePolicyFactory());
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options_.block_size = 256;
|
2020-06-13 00:29:22 +00:00
|
|
|
table_options_.block_restart_interval = args_.restart_interval;
|
|
|
|
table_options_.index_block_restart_interval = args_.restart_interval;
|
|
|
|
table_options_.format_version = args_.format_version;
|
2014-08-25 21:22:05 +00:00
|
|
|
options_.table_factory.reset(
|
|
|
|
new BlockBasedTableFactory(table_options_));
|
2020-06-13 00:29:22 +00:00
|
|
|
constructor_.reset(new TableConstructor(
|
|
|
|
options_.comparator, true /* convert_to_internal_key_ */));
|
2016-08-19 22:10:31 +00:00
|
|
|
internal_comparator_.reset(
|
|
|
|
new InternalKeyComparator(options_.comparator));
|
2013-12-20 17:35:24 +00:00
|
|
|
break;
|
2023-01-27 21:14:19 +00:00
|
|
|
|
2013-12-20 17:35:24 +00:00
|
|
|
case PLAIN_TABLE_SEMI_FIXED_PREFIX:
|
|
|
|
support_prev_ = false;
|
|
|
|
only_support_prefix_seek_ = true;
|
2014-03-10 19:56:46 +00:00
|
|
|
options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2));
|
2014-02-08 00:25:38 +00:00
|
|
|
options_.table_factory.reset(NewPlainTableFactory());
|
2020-06-13 00:29:22 +00:00
|
|
|
constructor_.reset(new TableConstructor(
|
|
|
|
options_.comparator, true /* convert_to_internal_key_ */));
|
2014-01-27 21:53:22 +00:00
|
|
|
internal_comparator_.reset(
|
|
|
|
new InternalKeyComparator(options_.comparator));
|
2013-12-20 17:35:24 +00:00
|
|
|
break;
|
|
|
|
case PLAIN_TABLE_FULL_STR_PREFIX:
|
|
|
|
support_prev_ = false;
|
|
|
|
only_support_prefix_seek_ = true;
|
2014-03-10 19:56:46 +00:00
|
|
|
options_.prefix_extractor.reset(NewNoopTransform());
|
2014-02-08 00:25:38 +00:00
|
|
|
options_.table_factory.reset(NewPlainTableFactory());
|
2020-06-13 00:29:22 +00:00
|
|
|
constructor_.reset(new TableConstructor(
|
|
|
|
options_.comparator, true /* convert_to_internal_key_ */));
|
2014-02-08 00:25:38 +00:00
|
|
|
internal_comparator_.reset(
|
|
|
|
new InternalKeyComparator(options_.comparator));
|
|
|
|
break;
|
|
|
|
case PLAIN_TABLE_TOTAL_ORDER:
|
|
|
|
support_prev_ = false;
|
|
|
|
only_support_prefix_seek_ = false;
|
|
|
|
options_.prefix_extractor = nullptr;
|
2014-07-18 07:08:38 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
PlainTableOptions plain_table_options;
|
|
|
|
plain_table_options.user_key_len = kPlainTableVariableLength;
|
|
|
|
plain_table_options.bloom_bits_per_key = 0;
|
|
|
|
plain_table_options.hash_table_ratio = 0;
|
|
|
|
|
|
|
|
options_.table_factory.reset(
|
|
|
|
NewPlainTableFactory(plain_table_options));
|
|
|
|
}
|
2020-06-13 00:29:22 +00:00
|
|
|
constructor_.reset(new TableConstructor(
|
|
|
|
options_.comparator, true /* convert_to_internal_key_ */));
|
2014-01-27 21:53:22 +00:00
|
|
|
internal_comparator_.reset(
|
|
|
|
new InternalKeyComparator(options_.comparator));
|
2011-03-18 22:37:00 +00:00
|
|
|
break;
|
|
|
|
case BLOCK_TEST:
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options_.block_size = 256;
|
|
|
|
options_.table_factory.reset(
|
|
|
|
new BlockBasedTableFactory(table_options_));
|
2020-06-13 00:29:22 +00:00
|
|
|
constructor_.reset(new BlockConstructor(options_.comparator));
|
2011-03-18 22:37:00 +00:00
|
|
|
break;
|
|
|
|
case MEMTABLE_TEST:
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options_.block_size = 256;
|
|
|
|
options_.table_factory.reset(
|
|
|
|
new BlockBasedTableFactory(table_options_));
|
2020-06-13 00:29:22 +00:00
|
|
|
constructor_.reset(
|
|
|
|
new MemTableConstructor(options_.comparator, &write_buffer_));
|
2011-03-18 22:37:00 +00:00
|
|
|
break;
|
|
|
|
case DB_TEST:
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options_.block_size = 256;
|
|
|
|
options_.table_factory.reset(
|
|
|
|
new BlockBasedTableFactory(table_options_));
|
2020-06-13 00:29:22 +00:00
|
|
|
constructor_.reset(new DBConstructor(options_.comparator));
|
2011-03-18 22:37:00 +00:00
|
|
|
break;
|
|
|
|
}
|
2021-05-05 20:59:21 +00:00
|
|
|
ioptions_ = ImmutableOptions(options_);
|
2018-05-21 21:33:55 +00:00
|
|
|
moptions_ = MutableCFOptions(options_);
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void Add(const std::string& key, const std::string& value) {
|
|
|
|
constructor_->Add(key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
void Test(Random* rnd) {
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap data;
|
2018-05-21 21:33:55 +00:00
|
|
|
constructor_->Finish(options_, ioptions_, moptions_, table_options_,
|
2014-09-04 23:18:36 +00:00
|
|
|
*internal_comparator_, &keys, &data);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
TestForwardScan(keys, data);
|
2013-12-20 17:35:24 +00:00
|
|
|
if (support_prev_) {
|
|
|
|
TestBackwardScan(keys, data);
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
TestRandomAccess(rnd, keys, data);
|
|
|
|
}
|
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
void TestForwardScan(const std::vector<std::string>& /*keys*/,
|
2015-09-02 20:58:22 +00:00
|
|
|
const stl_wrappers::KVMap& data) {
|
2015-10-12 22:06:38 +00:00
|
|
|
InternalIterator* iter = constructor_->NewIterator();
|
2011-03-18 22:37:00 +00:00
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
iter->SeekToFirst();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2015-09-02 20:58:22 +00:00
|
|
|
for (stl_wrappers::KVMap::const_iterator model_iter = data.begin();
|
|
|
|
model_iter != data.end(); ++model_iter) {
|
2011-03-18 22:37:00 +00:00
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
iter->Next();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2014-09-05 00:40:41 +00:00
|
|
|
if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
|
2015-10-12 22:06:38 +00:00
|
|
|
iter->~InternalIterator();
|
2014-09-05 00:40:41 +00:00
|
|
|
} else {
|
|
|
|
delete iter;
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2018-03-05 21:08:17 +00:00
|
|
|
void TestBackwardScan(const std::vector<std::string>& /*keys*/,
|
2015-09-02 20:58:22 +00:00
|
|
|
const stl_wrappers::KVMap& data) {
|
2015-10-12 22:06:38 +00:00
|
|
|
InternalIterator* iter = constructor_->NewIterator();
|
2011-03-18 22:37:00 +00:00
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
iter->SeekToLast();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2015-09-02 20:58:22 +00:00
|
|
|
for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin();
|
|
|
|
model_iter != data.rend(); ++model_iter) {
|
2011-03-18 22:37:00 +00:00
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
iter->Prev();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2014-09-05 00:40:41 +00:00
|
|
|
if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
|
2015-10-12 22:06:38 +00:00
|
|
|
iter->~InternalIterator();
|
2014-09-05 00:40:41 +00:00
|
|
|
} else {
|
|
|
|
delete iter;
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2015-09-02 20:58:22 +00:00
|
|
|
void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys,
|
|
|
|
const stl_wrappers::KVMap& data) {
|
2011-03-18 22:37:00 +00:00
|
|
|
static const bool kVerbose = false;
|
2015-10-12 22:06:38 +00:00
|
|
|
InternalIterator* iter = constructor_->NewIterator();
|
2011-03-18 22:37:00 +00:00
|
|
|
ASSERT_TRUE(!iter->Valid());
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap::const_iterator model_iter = data.begin();
|
2023-12-01 19:15:17 +00:00
|
|
|
if (kVerbose) {
|
|
|
|
fprintf(stderr, "---\n");
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
for (int i = 0; i < 200; i++) {
|
2013-12-20 17:35:24 +00:00
|
|
|
const int toss = rnd->Uniform(support_prev_ ? 5 : 3);
|
2011-03-18 22:37:00 +00:00
|
|
|
switch (toss) {
|
|
|
|
case 0: {
|
|
|
|
if (iter->Valid()) {
|
2023-12-01 19:15:17 +00:00
|
|
|
if (kVerbose) {
|
|
|
|
fprintf(stderr, "Next\n");
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
iter->Next();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2011-03-18 22:37:00 +00:00
|
|
|
++model_iter;
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case 1: {
|
2023-12-01 19:15:17 +00:00
|
|
|
if (kVerbose) {
|
|
|
|
fprintf(stderr, "SeekToFirst\n");
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
iter->SeekToFirst();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2011-03-18 22:37:00 +00:00
|
|
|
model_iter = data.begin();
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case 2: {
|
|
|
|
std::string key = PickRandomKey(rnd, keys);
|
|
|
|
model_iter = data.lower_bound(key);
|
2023-12-01 19:15:17 +00:00
|
|
|
if (kVerbose) {
|
2022-10-25 18:50:38 +00:00
|
|
|
fprintf(stderr, "Seek '%s'\n", EscapeString(key).c_str());
|
2023-12-01 19:15:17 +00:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
iter->Seek(Slice(key));
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2011-03-18 22:37:00 +00:00
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case 3: {
|
|
|
|
if (iter->Valid()) {
|
2023-12-01 19:15:17 +00:00
|
|
|
if (kVerbose) {
|
|
|
|
fprintf(stderr, "Prev\n");
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
iter->Prev();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2011-03-18 22:37:00 +00:00
|
|
|
if (model_iter == data.begin()) {
|
2022-10-25 18:50:38 +00:00
|
|
|
model_iter = data.end(); // Wrap around to invalid value
|
2011-03-18 22:37:00 +00:00
|
|
|
} else {
|
|
|
|
--model_iter;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case 4: {
|
2023-12-01 19:15:17 +00:00
|
|
|
if (kVerbose) {
|
|
|
|
fprintf(stderr, "SeekToLast\n");
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
iter->SeekToLast();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2011-03-18 22:37:00 +00:00
|
|
|
if (keys.empty()) {
|
|
|
|
model_iter = data.end();
|
|
|
|
} else {
|
|
|
|
std::string last = data.rbegin()->first;
|
|
|
|
model_iter = data.lower_bound(last);
|
|
|
|
}
|
|
|
|
ASSERT_EQ(ToString(data, model_iter), ToString(iter));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2014-09-05 00:40:41 +00:00
|
|
|
if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) {
|
2015-10-12 22:06:38 +00:00
|
|
|
iter->~InternalIterator();
|
2014-09-05 00:40:41 +00:00
|
|
|
} else {
|
|
|
|
delete iter;
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2015-09-02 20:58:22 +00:00
|
|
|
std::string ToString(const stl_wrappers::KVMap& data,
|
|
|
|
const stl_wrappers::KVMap::const_iterator& it) {
|
2011-03-18 22:37:00 +00:00
|
|
|
if (it == data.end()) {
|
|
|
|
return "END";
|
|
|
|
} else {
|
|
|
|
return "'" + it->first + "->" + it->second + "'";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-02 20:58:22 +00:00
|
|
|
std::string ToString(const stl_wrappers::KVMap& data,
|
|
|
|
const stl_wrappers::KVMap::const_reverse_iterator& it) {
|
2011-03-18 22:37:00 +00:00
|
|
|
if (it == data.rend()) {
|
|
|
|
return "END";
|
|
|
|
} else {
|
|
|
|
return "'" + it->first + "->" + it->second + "'";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-12 22:06:38 +00:00
|
|
|
std::string ToString(const InternalIterator* it) {
|
2011-03-18 22:37:00 +00:00
|
|
|
if (!it->Valid()) {
|
|
|
|
return "END";
|
|
|
|
} else {
|
|
|
|
return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
|
|
|
|
if (keys.empty()) {
|
|
|
|
return "foo";
|
|
|
|
} else {
|
2014-11-11 21:47:22 +00:00
|
|
|
const int index = rnd->Uniform(static_cast<int>(keys.size()));
|
2011-03-18 22:37:00 +00:00
|
|
|
std::string result = keys[index];
|
2013-12-20 17:35:24 +00:00
|
|
|
switch (rnd->Uniform(support_prev_ ? 3 : 1)) {
|
2011-03-18 22:37:00 +00:00
|
|
|
case 0:
|
|
|
|
// Return an existing key
|
|
|
|
break;
|
|
|
|
case 1: {
|
|
|
|
// Attempt to return something smaller than an existing key
|
2022-10-25 18:50:38 +00:00
|
|
|
if (result.size() > 0 && result[result.size() - 1] > '\0' &&
|
|
|
|
(!only_support_prefix_seek_ ||
|
|
|
|
options_.prefix_extractor->Transform(result).size() <
|
|
|
|
result.size())) {
|
2013-12-20 17:35:24 +00:00
|
|
|
result[result.size() - 1]--;
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
break;
|
2022-10-25 18:50:38 +00:00
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
case 2: {
|
|
|
|
// Return something larger than an existing key
|
|
|
|
Increment(options_.comparator, &result);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-03-01 02:04:58 +00:00
|
|
|
// Returns nullptr if not running against a DB
|
2011-03-21 19:40:57 +00:00
|
|
|
DB* db() const { return constructor_->db(); }
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
private:
|
2020-06-13 00:29:22 +00:00
|
|
|
TestArgs args_;
|
|
|
|
Options options_;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions_;
|
2018-05-21 21:33:55 +00:00
|
|
|
MutableCFOptions moptions_;
|
2020-06-13 00:29:22 +00:00
|
|
|
BlockBasedTableOptions table_options_;
|
|
|
|
std::unique_ptr<Constructor> constructor_;
|
2016-06-21 01:01:03 +00:00
|
|
|
WriteBufferManager write_buffer_;
|
2013-12-20 17:35:24 +00:00
|
|
|
bool support_prev_;
|
|
|
|
bool only_support_prefix_seek_;
|
2018-11-09 19:17:34 +00:00
|
|
|
std::shared_ptr<InternalKeyComparator> internal_comparator_;
|
2011-03-18 22:37:00 +00:00
|
|
|
};
|
|
|
|
|
2020-06-13 00:29:22 +00:00
|
|
|
class ParameterizedHarnessTest : public HarnessTest,
|
|
|
|
public testing::WithParamInterface<TestArgs> {
|
|
|
|
public:
|
|
|
|
ParameterizedHarnessTest() : HarnessTest(GetParam()) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(TableTest, ParameterizedHarnessTest,
|
|
|
|
::testing::ValuesIn(GenerateArgList()));
|
|
|
|
|
|
|
|
class DBHarnessTest : public HarnessTest {
|
|
|
|
public:
|
|
|
|
DBHarnessTest()
|
|
|
|
: HarnessTest(TestArgs{DB_TEST, /* reverse_compare */ false,
|
|
|
|
/* restart_interval */ 16, kNoCompression,
|
|
|
|
/* compression_parallel_threads */ 1,
|
|
|
|
/* format_version */ 0, /* use_mmap */ false}) {}
|
|
|
|
};
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
|
|
|
|
bool result = (val >= low) && (val <= high);
|
|
|
|
if (!result) {
|
|
|
|
fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
|
2022-10-25 18:50:38 +00:00
|
|
|
(unsigned long long)(val), (unsigned long long)(low),
|
2011-03-18 22:37:00 +00:00
|
|
|
(unsigned long long)(high));
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2014-01-24 20:14:08 +00:00
|
|
|
// Tests against all kinds of tables
|
2015-03-17 21:08:00 +00:00
|
|
|
class TableTest : public testing::Test {
|
2014-01-27 21:53:22 +00:00
|
|
|
public:
|
|
|
|
const InternalKeyComparator& GetPlainInternalComparator(
|
|
|
|
const Comparator* comp) {
|
|
|
|
if (!plain_internal_comparator) {
|
|
|
|
plain_internal_comparator.reset(
|
|
|
|
new test::PlainInternalKeyComparator(comp));
|
|
|
|
}
|
|
|
|
return *plain_internal_comparator;
|
|
|
|
}
|
2017-02-07 00:29:29 +00:00
|
|
|
void IndexTest(BlockBasedTableOptions table_options);
|
2014-01-27 21:53:22 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
std::unique_ptr<InternalKeyComparator> plain_internal_comparator;
|
|
|
|
};
|
|
|
|
|
|
|
|
class GeneralTableTest : public TableTest {};
|
2022-05-17 22:01:51 +00:00
|
|
|
class BlockBasedTableTestBase : public TableTest {};
|
2018-06-05 02:59:44 +00:00
|
|
|
class BlockBasedTableTest
|
2022-05-17 22:01:51 +00:00
|
|
|
: public BlockBasedTableTestBase,
|
2018-06-05 02:59:44 +00:00
|
|
|
virtual public ::testing::WithParamInterface<uint32_t> {
|
|
|
|
public:
|
2023-12-06 21:48:15 +00:00
|
|
|
BlockBasedTableTest() : format_(GetParam()) { env_ = Env::Default(); }
|
2018-06-05 02:59:44 +00:00
|
|
|
|
|
|
|
BlockBasedTableOptions GetBlockBasedTableOptions() {
|
|
|
|
BlockBasedTableOptions options;
|
|
|
|
options.format_version = format_;
|
|
|
|
return options;
|
|
|
|
}
|
|
|
|
|
2019-07-17 20:02:00 +00:00
|
|
|
void SetupTracingTest(TableConstructor* c) {
|
|
|
|
test_path_ = test::PerThreadDBPath("block_based_table_tracing_test");
|
|
|
|
EXPECT_OK(env_->CreateDir(test_path_));
|
|
|
|
trace_file_path_ = test_path_ + "/block_cache_trace_file";
|
2022-10-21 19:15:35 +00:00
|
|
|
|
|
|
|
BlockCacheTraceWriterOptions trace_writer_opt;
|
|
|
|
BlockCacheTraceOptions trace_opt;
|
2019-07-17 20:02:00 +00:00
|
|
|
std::unique_ptr<TraceWriter> trace_writer;
|
|
|
|
EXPECT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path_,
|
|
|
|
&trace_writer));
|
2022-10-21 19:15:35 +00:00
|
|
|
std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
|
|
|
|
NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
|
|
|
|
std::move(trace_writer));
|
|
|
|
ASSERT_NE(block_cache_trace_writer, nullptr);
|
2020-10-08 18:11:52 +00:00
|
|
|
// Always return Status::OK().
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
ASSERT_OK(c->block_cache_tracer_.StartTrace(
|
|
|
|
trace_opt, std::move(block_cache_trace_writer)));
|
2022-10-21 19:15:35 +00:00
|
|
|
|
2019-07-17 20:02:00 +00:00
|
|
|
{
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
InternalKey internal_key(auto_add_key1, 0, kTypeValue);
|
2019-07-17 20:02:00 +00:00
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
|
|
|
c->Add(encoded_key, kDummyValue);
|
|
|
|
}
|
|
|
|
{
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
InternalKey internal_key(auto_add_key2, 0, kTypeValue);
|
2019-07-17 20:02:00 +00:00
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
|
|
|
c->Add(encoded_key, kDummyValue);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyBlockAccessTrace(
|
|
|
|
TableConstructor* c,
|
|
|
|
const std::vector<BlockCacheTraceRecord>& expected_records) {
|
|
|
|
c->block_cache_tracer_.EndTrace();
|
|
|
|
|
2020-04-03 16:52:38 +00:00
|
|
|
{
|
|
|
|
std::unique_ptr<TraceReader> trace_reader;
|
2022-10-25 18:50:38 +00:00
|
|
|
Status s = NewFileTraceReader(env_, EnvOptions(), trace_file_path_,
|
|
|
|
&trace_reader);
|
2020-04-03 16:52:38 +00:00
|
|
|
EXPECT_OK(s);
|
|
|
|
BlockCacheTraceReader reader(std::move(trace_reader));
|
|
|
|
BlockCacheTraceHeader header;
|
|
|
|
EXPECT_OK(reader.ReadHeader(&header));
|
|
|
|
uint32_t index = 0;
|
|
|
|
while (s.ok()) {
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
SCOPED_TRACE("expected_records[" + std::to_string(index) + "]");
|
2020-04-03 16:52:38 +00:00
|
|
|
BlockCacheTraceRecord access;
|
|
|
|
s = reader.ReadAccess(&access);
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
2019-07-17 20:02:00 +00:00
|
|
|
}
|
2020-04-03 16:52:38 +00:00
|
|
|
ASSERT_LT(index, expected_records.size());
|
|
|
|
EXPECT_NE("", access.block_key);
|
|
|
|
EXPECT_EQ(access.block_type, expected_records[index].block_type);
|
|
|
|
EXPECT_GT(access.block_size, 0);
|
|
|
|
EXPECT_EQ(access.caller, expected_records[index].caller);
|
|
|
|
EXPECT_EQ(access.no_insert, expected_records[index].no_insert);
|
|
|
|
EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit);
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
EXPECT_EQ(access.get_id, expected_records[index].get_id);
|
|
|
|
// The well-populated cases
|
|
|
|
if (access.caller == TableReaderCaller::kUserGet ||
|
|
|
|
(access.caller == TableReaderCaller::kUserMultiGet &&
|
|
|
|
access.block_type == TraceType::kBlockTraceDataBlock)) {
|
2020-04-03 16:52:38 +00:00
|
|
|
EXPECT_EQ(access.referenced_key,
|
|
|
|
expected_records[index].referenced_key);
|
|
|
|
EXPECT_EQ(access.get_from_user_specified_snapshot,
|
|
|
|
expected_records[index].get_from_user_specified_snapshot);
|
|
|
|
if (access.block_type == TraceType::kBlockTraceDataBlock) {
|
|
|
|
EXPECT_GT(access.referenced_data_size, 0);
|
|
|
|
EXPECT_GT(access.num_keys_in_block, 0);
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
if (access.caller == TableReaderCaller::kUserMultiGet) {
|
|
|
|
// Test num_keys_in_block estimate, assuming default restart
|
|
|
|
// interval of 16 and just one interval.
|
|
|
|
// Rounding depends on get_id.
|
|
|
|
if (access.get_id & 1) {
|
|
|
|
EXPECT_EQ(access.num_keys_in_block, 9);
|
|
|
|
} else {
|
|
|
|
EXPECT_EQ(access.num_keys_in_block, 8);
|
|
|
|
}
|
|
|
|
}
|
2020-04-03 16:52:38 +00:00
|
|
|
EXPECT_EQ(access.referenced_key_exist_in_block,
|
|
|
|
expected_records[index].referenced_key_exist_in_block);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
EXPECT_EQ(access.referenced_key, "");
|
2022-10-21 19:15:35 +00:00
|
|
|
EXPECT_FALSE(access.get_from_user_specified_snapshot);
|
2020-04-03 16:52:38 +00:00
|
|
|
EXPECT_EQ(access.referenced_data_size, 0);
|
|
|
|
EXPECT_EQ(access.num_keys_in_block, 0);
|
2022-10-21 19:15:35 +00:00
|
|
|
EXPECT_FALSE(access.referenced_key_exist_in_block);
|
2020-04-03 16:52:38 +00:00
|
|
|
}
|
|
|
|
index++;
|
2019-07-17 20:02:00 +00:00
|
|
|
}
|
2020-04-03 16:52:38 +00:00
|
|
|
EXPECT_EQ(index, expected_records.size());
|
2019-07-17 20:02:00 +00:00
|
|
|
}
|
|
|
|
EXPECT_OK(env_->DeleteFile(trace_file_path_));
|
|
|
|
EXPECT_OK(env_->DeleteDir(test_path_));
|
|
|
|
}
|
|
|
|
|
2018-01-10 23:06:29 +00:00
|
|
|
protected:
|
|
|
|
uint64_t IndexUncompressedHelper(bool indexCompress);
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
const std::string auto_add_key1 = "aak01";
|
|
|
|
const std::string auto_add_key2 = "aak02";
|
2018-06-05 02:59:44 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
uint32_t format_;
|
2019-07-17 20:02:00 +00:00
|
|
|
Env* env_;
|
|
|
|
std::string trace_file_path_;
|
|
|
|
std::string test_path_;
|
2018-01-10 23:06:29 +00:00
|
|
|
};
|
2014-01-27 21:53:22 +00:00
|
|
|
class PlainTableTest : public TableTest {};
|
2015-03-17 21:08:00 +00:00
|
|
|
class TablePropertyTest : public testing::Test {};
|
2018-07-20 21:31:27 +00:00
|
|
|
class BBTTailPrefetchTest : public TableTest {};
|
2014-02-12 21:14:59 +00:00
|
|
|
|
2020-02-10 23:42:46 +00:00
|
|
|
// The helper class to test the file checksum
|
|
|
|
class FileChecksumTestHelper {
|
|
|
|
public:
|
|
|
|
FileChecksumTestHelper(bool convert_to_internal_key = false)
|
2022-10-25 18:50:38 +00:00
|
|
|
: convert_to_internal_key_(convert_to_internal_key) {}
|
2023-12-01 19:15:17 +00:00
|
|
|
~FileChecksumTestHelper() = default;
|
2020-02-10 23:42:46 +00:00
|
|
|
|
2022-06-22 22:45:21 +00:00
|
|
|
void CreateWritableFile() {
|
2020-06-04 22:32:29 +00:00
|
|
|
sink_ = new test::StringSink();
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSWritableFile> holder(sink_);
|
|
|
|
file_writer_.reset(new WritableFileWriter(
|
|
|
|
std::move(holder), "" /* don't care */, FileOptions()));
|
2020-02-10 23:42:46 +00:00
|
|
|
}
|
|
|
|
|
2020-03-29 22:57:02 +00:00
|
|
|
void SetFileChecksumGenerator(FileChecksumGenerator* checksum_generator) {
|
2020-02-10 23:42:46 +00:00
|
|
|
if (file_writer_ != nullptr) {
|
2020-03-29 22:57:02 +00:00
|
|
|
file_writer_->TEST_SetFileChecksumGenerator(checksum_generator);
|
2020-03-30 21:08:55 +00:00
|
|
|
} else {
|
|
|
|
delete checksum_generator;
|
2020-02-10 23:42:46 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
WritableFileWriter* GetFileWriter() { return file_writer_.get(); }
|
|
|
|
|
|
|
|
Status ResetTableBuilder(std::unique_ptr<TableBuilder>&& builder) {
|
|
|
|
assert(builder != nullptr);
|
|
|
|
table_builder_ = std::move(builder);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
void AddKVtoKVMap(int num_entries) {
|
|
|
|
Random rnd(test::RandomSeed());
|
|
|
|
for (int i = 0; i < num_entries; i++) {
|
2020-07-09 21:33:42 +00:00
|
|
|
std::string v = rnd.RandomString(100);
|
2020-02-10 23:42:46 +00:00
|
|
|
kv_map_[test::RandomKey(&rnd, 20)] = v;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status WriteKVAndFlushTable() {
|
2020-06-24 23:20:55 +00:00
|
|
|
for (const auto& kv : kv_map_) {
|
2020-02-10 23:42:46 +00:00
|
|
|
if (convert_to_internal_key_) {
|
|
|
|
ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue);
|
|
|
|
std::string encoded;
|
|
|
|
AppendInternalKey(&encoded, ikey);
|
|
|
|
table_builder_->Add(encoded, kv.second);
|
|
|
|
} else {
|
|
|
|
table_builder_->Add(kv.first, kv.second);
|
|
|
|
}
|
|
|
|
EXPECT_TRUE(table_builder_->status().ok());
|
|
|
|
}
|
|
|
|
Status s = table_builder_->Finish();
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
EXPECT_OK(file_writer_->Flush(IOOptions()));
|
2020-12-24 00:54:05 +00:00
|
|
|
EXPECT_OK(s);
|
2020-02-10 23:42:46 +00:00
|
|
|
|
|
|
|
EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-03-29 22:57:02 +00:00
|
|
|
std::string GetFileChecksum() {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
EXPECT_OK(file_writer_->Close(IOOptions()));
|
2020-03-29 22:57:02 +00:00
|
|
|
return table_builder_->GetFileChecksum();
|
|
|
|
}
|
2020-02-10 23:42:46 +00:00
|
|
|
|
|
|
|
const char* GetFileChecksumFuncName() {
|
|
|
|
return table_builder_->GetFileChecksumFuncName();
|
|
|
|
}
|
|
|
|
|
2020-03-29 22:57:02 +00:00
|
|
|
Status CalculateFileChecksum(FileChecksumGenerator* file_checksum_generator,
|
2020-02-10 23:42:46 +00:00
|
|
|
std::string* checksum) {
|
2020-03-29 22:57:02 +00:00
|
|
|
assert(file_checksum_generator != nullptr);
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
cur_file_num_ = checksum_file_num_++;
|
2020-02-10 23:42:46 +00:00
|
|
|
test::StringSink* ss_rw =
|
2021-01-04 23:59:52 +00:00
|
|
|
static_cast<test::StringSink*>(file_writer_->writable_file());
|
|
|
|
std::unique_ptr<FSRandomAccessFile> source(
|
|
|
|
new test::StringSource(ss_rw->contents()));
|
|
|
|
file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
|
|
|
|
|
2020-02-10 23:42:46 +00:00
|
|
|
std::unique_ptr<char[]> scratch(new char[2048]);
|
|
|
|
Slice result;
|
|
|
|
uint64_t offset = 0;
|
|
|
|
Status s;
|
2020-04-30 21:48:51 +00:00
|
|
|
s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(),
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
nullptr);
|
2020-02-10 23:42:46 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
while (result.size() != 0) {
|
2020-03-29 22:57:02 +00:00
|
|
|
file_checksum_generator->Update(scratch.get(), result.size());
|
2020-02-10 23:42:46 +00:00
|
|
|
offset += static_cast<uint64_t>(result.size());
|
2020-04-30 21:48:51 +00:00
|
|
|
s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(),
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
nullptr);
|
2020-02-10 23:42:46 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPECT_EQ(offset, static_cast<uint64_t>(table_builder_->FileSize()));
|
2020-03-29 22:57:02 +00:00
|
|
|
file_checksum_generator->Finalize();
|
|
|
|
*checksum = file_checksum_generator->GetChecksum();
|
2020-02-10 23:42:46 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
bool convert_to_internal_key_;
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
uint64_t cur_file_num_;
|
2020-02-10 23:42:46 +00:00
|
|
|
std::unique_ptr<WritableFileWriter> file_writer_;
|
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader_;
|
|
|
|
std::unique_ptr<TableBuilder> table_builder_;
|
|
|
|
stl_wrappers::KVMap kv_map_;
|
2020-06-04 22:32:29 +00:00
|
|
|
test::StringSink* sink_ = nullptr;
|
2020-02-10 23:42:46 +00:00
|
|
|
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
static uint64_t checksum_file_num_;
|
2020-02-10 23:42:46 +00:00
|
|
|
};
|
|
|
|
|
Always verify SST unique IDs on SST file open (#10532)
Summary:
Although we've been tracking SST unique IDs in the DB manifest
unconditionally, checking has been opt-in and with an extra pass at DB::Open
time. This changes the behavior of `verify_sst_unique_id_in_manifest` to
check unique ID against manifest every time an SST file is opened through
table cache (normal DB operations), replacing the explicit pass over files
at DB::Open time. This change also enables the option by default and
removes the "EXPERIMENTAL" designation.
One possible criticism is that the option no longer ensures the integrity
of a DB at Open time. This is far from an all-or-nothing issue. Verifying
the IDs of all SST files hardly ensures all the data in the DB is readable.
(VerifyChecksum is supposed to do that.) Also, with
max_open_files=-1 (default, extremely common), all SST files are
opened at DB::Open time anyway.
Implementation details:
* `VerifySstUniqueIdInManifest()` functions are the extra/explicit pass
that is now removed.
* Unit tests that manipulate/corrupt table properties have to opt out of
this check, because that corrupts the "actual" unique id. (And even for
testing we don't currently have a mechanism to set "no unique id"
in the in-memory file metadata for new files.)
* A lot of other unit test churn relates to (a) default checking on, and
(b) checking on SST open even without DB::Open (e.g. on flush)
* Use `FileMetaData` for more `TableCache` operations (in place of
`FileDescriptor`) so that we have access to the unique_id whenever
we might need to open an SST file. **There is the possibility of
performance impact because we can no longer use the more
localized `fd` part of an `FdWithKeyRange` but instead follow the
`file_metadata` pointer. However, this change (possible regression)
is only done for `GetMemoryUsageByTableReaders`.**
* Removed a completely unnecessary constructor overload of
`TableReaderOptions`
Possible follow-up:
* Verification only happens when opening through table cache. Are there
more places where this should happen?
* Improve error message when there is a file size mismatch vs. manifest
(FIXME added in the appropriate place).
* I'm not sure there's a justification for `FileDescriptor` to be distinct from
`FileMetaData`.
* I'm skeptical that `FdWithKeyRange` really still makes sense for
optimizing some data locality by duplicating some data in memory, but I
could be wrong.
* An unnecessary overload of NewTableReader was recently added, in
the public API nonetheless (though unusable there). It should be cleaned
up to put most things under `TableReaderOptions`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10532
Test Plan:
updated unit tests
Performance test showing no significant difference (just noise I think):
`./db_bench -benchmarks=readwhilewriting[-X10] -num=3000000 -disable_wal=1 -bloom_bits=8 -write_buffer_size=1000000 -target_file_size_base=1000000`
Before: readwhilewriting [AVG 10 runs] : 68702 (± 6932) ops/sec
After: readwhilewriting [AVG 10 runs] : 68239 (± 7198) ops/sec
Reviewed By: jay-zhuang
Differential Revision: D38765551
Pulled By: pdillinger
fbshipit-source-id: a827a708155f12344ab2a5c16e7701c7636da4c2
2022-09-08 05:52:42 +00:00
|
|
|
uint64_t FileChecksumTestHelper::checksum_file_num_ = 1;
|
2020-02-10 23:42:46 +00:00
|
|
|
|
2021-12-10 16:12:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest,
|
|
|
|
testing::ValuesIn(test::kFooterFormatVersionsToTest));
|
2018-06-05 02:59:44 +00:00
|
|
|
|
2014-02-12 21:14:59 +00:00
|
|
|
// This test serves as the living tutorial for the prefix scan of user collected
|
|
|
|
// properties.
|
2015-03-17 21:08:00 +00:00
|
|
|
TEST_F(TablePropertyTest, PrefixScanTest) {
|
2022-10-25 18:50:38 +00:00
|
|
|
UserCollectedProperties props{
|
|
|
|
{"num.111.1", "1"}, {"num.111.2", "2"}, {"num.111.3", "3"},
|
|
|
|
{"num.333.1", "1"}, {"num.333.2", "2"}, {"num.333.3", "3"},
|
|
|
|
{"num.555.1", "1"}, {"num.555.2", "2"}, {"num.555.3", "3"},
|
|
|
|
};
|
2014-02-12 21:14:59 +00:00
|
|
|
|
|
|
|
// prefixes that exist
|
2021-10-18 19:21:25 +00:00
|
|
|
for (const std::string prefix : {"num.111", "num.333", "num.555"}) {
|
2014-02-12 21:14:59 +00:00
|
|
|
int num = 0;
|
|
|
|
for (auto pos = props.lower_bound(prefix);
|
|
|
|
pos != props.end() &&
|
2022-10-25 18:50:38 +00:00
|
|
|
pos->first.compare(0, prefix.size(), prefix) == 0;
|
2014-02-12 21:14:59 +00:00
|
|
|
++pos) {
|
|
|
|
++num;
|
2022-05-06 20:03:58 +00:00
|
|
|
auto key = prefix + "." + std::to_string(num);
|
2014-02-12 21:14:59 +00:00
|
|
|
ASSERT_EQ(key, pos->first);
|
2022-05-06 20:03:58 +00:00
|
|
|
ASSERT_EQ(std::to_string(num), pos->second);
|
2014-02-12 21:14:59 +00:00
|
|
|
}
|
|
|
|
ASSERT_EQ(3, num);
|
|
|
|
}
|
|
|
|
|
|
|
|
// prefixes that don't exist
|
2021-10-18 19:21:25 +00:00
|
|
|
for (const std::string prefix :
|
2014-02-12 21:14:59 +00:00
|
|
|
{"num.000", "num.222", "num.444", "num.666"}) {
|
|
|
|
auto pos = props.lower_bound(prefix);
|
|
|
|
ASSERT_TRUE(pos == props.end() ||
|
|
|
|
pos->first.compare(0, prefix.size(), prefix) != 0);
|
|
|
|
}
|
|
|
|
}
|
2011-03-18 22:37:00 +00:00
|
|
|
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
namespace {
|
|
|
|
struct TestIds {
|
|
|
|
UniqueId64x3 internal_id;
|
|
|
|
UniqueId64x3 external_id;
|
|
|
|
};
|
|
|
|
|
|
|
|
inline bool operator==(const TestIds& lhs, const TestIds& rhs) {
|
|
|
|
return lhs.internal_id == rhs.internal_id &&
|
|
|
|
lhs.external_id == rhs.external_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::ostream& operator<<(std::ostream& os, const TestIds& ids) {
|
|
|
|
return os << std::hex << "{{{ 0x" << ids.internal_id[0] << "U, 0x"
|
|
|
|
<< ids.internal_id[1] << "U, 0x" << ids.internal_id[2]
|
|
|
|
<< "U }}, {{ 0x" << ids.external_id[0] << "U, 0x"
|
|
|
|
<< ids.external_id[1] << "U, 0x" << ids.external_id[2] << "U }}}";
|
|
|
|
}
|
|
|
|
|
|
|
|
TestIds GetUniqueId(TableProperties* tp, std::unordered_set<uint64_t>* seen,
|
|
|
|
const std::string& db_id, const std::string& db_session_id,
|
|
|
|
uint64_t file_number) {
|
|
|
|
// First test session id logic
|
|
|
|
if (db_session_id.size() == 20) {
|
|
|
|
uint64_t upper;
|
|
|
|
uint64_t lower;
|
|
|
|
EXPECT_OK(DecodeSessionId(db_session_id, &upper, &lower));
|
|
|
|
EXPECT_EQ(EncodeSessionId(upper, lower), db_session_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get external using public API
|
|
|
|
tp->db_id = db_id;
|
|
|
|
tp->db_session_id = db_session_id;
|
|
|
|
tp->orig_file_number = file_number;
|
|
|
|
TestIds t;
|
|
|
|
{
|
2022-05-18 01:43:48 +00:00
|
|
|
std::string euid;
|
|
|
|
EXPECT_OK(GetExtendedUniqueIdFromTableProperties(*tp, &euid));
|
|
|
|
EXPECT_EQ(euid.size(), 24U);
|
2023-12-01 19:15:17 +00:00
|
|
|
t.external_id[0] = DecodeFixed64(euid.data());
|
2022-05-18 01:43:48 +00:00
|
|
|
t.external_id[1] = DecodeFixed64(&euid[8]);
|
|
|
|
t.external_id[2] = DecodeFixed64(&euid[16]);
|
|
|
|
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
std::string uid;
|
|
|
|
EXPECT_OK(GetUniqueIdFromTableProperties(*tp, &uid));
|
2022-05-18 01:43:48 +00:00
|
|
|
EXPECT_EQ(uid.size(), 16U);
|
|
|
|
EXPECT_EQ(uid, euid.substr(0, 16));
|
2023-12-01 19:15:17 +00:00
|
|
|
EXPECT_EQ(t.external_id[0], DecodeFixed64(uid.data()));
|
2022-05-18 01:43:48 +00:00
|
|
|
EXPECT_EQ(t.external_id[1], DecodeFixed64(&uid[8]));
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
}
|
|
|
|
// All these should be effectively random
|
|
|
|
EXPECT_TRUE(seen->insert(t.external_id[0]).second);
|
|
|
|
EXPECT_TRUE(seen->insert(t.external_id[1]).second);
|
|
|
|
EXPECT_TRUE(seen->insert(t.external_id[2]).second);
|
|
|
|
|
|
|
|
// Get internal with internal API
|
|
|
|
EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number,
|
|
|
|
&t.internal_id));
|
2022-05-18 01:43:48 +00:00
|
|
|
EXPECT_NE(t.internal_id, kNullUniqueId64x3);
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
|
|
|
|
// Verify relationship
|
|
|
|
UniqueId64x3 tmp = t.internal_id;
|
|
|
|
InternalUniqueIdToExternal(&tmp);
|
|
|
|
EXPECT_EQ(tmp, t.external_id);
|
|
|
|
ExternalUniqueIdToInternal(&tmp);
|
|
|
|
EXPECT_EQ(tmp, t.internal_id);
|
2022-05-18 01:43:48 +00:00
|
|
|
|
|
|
|
// And 128-bit internal version
|
|
|
|
UniqueId64x2 tmp2{};
|
|
|
|
EXPECT_OK(GetSstInternalUniqueId(db_id, db_session_id, file_number, &tmp2));
|
|
|
|
EXPECT_NE(tmp2, kNullUniqueId64x2);
|
|
|
|
|
|
|
|
EXPECT_EQ(tmp2[0], t.internal_id[0]);
|
|
|
|
EXPECT_EQ(tmp2[1], t.internal_id[1]);
|
|
|
|
InternalUniqueIdToExternal(&tmp2);
|
|
|
|
EXPECT_EQ(tmp2[0], t.external_id[0]);
|
|
|
|
EXPECT_EQ(tmp2[1], t.external_id[1]);
|
|
|
|
ExternalUniqueIdToInternal(&tmp2);
|
|
|
|
EXPECT_EQ(tmp2[0], t.internal_id[0]);
|
|
|
|
EXPECT_EQ(tmp2[1], t.internal_id[1]);
|
|
|
|
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
return t;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
TEST_F(TablePropertyTest, UniqueIdsSchemaAndQuality) {
|
|
|
|
// To ensure the computation only depends on the expected entries, we set
|
|
|
|
// the rest randomly
|
|
|
|
TableProperties tp;
|
|
|
|
TEST_SetRandomTableProperties(&tp);
|
|
|
|
|
|
|
|
// DB id is normally RFC-4122
|
|
|
|
const std::string db_id1 = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d";
|
|
|
|
// Allow other forms of DB id
|
|
|
|
const std::string db_id2 = "1728000184588763620";
|
|
|
|
const std::string db_id3 = "x";
|
|
|
|
|
|
|
|
// DB session id is normally 20 chars in base-36, but 13 to 24 chars
|
|
|
|
// is ok, roughly 64 to 128 bits.
|
|
|
|
const std::string ses_id1 = "ABCDEFGHIJ0123456789";
|
|
|
|
// Same trailing 13 digits
|
|
|
|
const std::string ses_id2 = "HIJ0123456789";
|
|
|
|
const std::string ses_id3 = "0123ABCDEFGHIJ0123456789";
|
|
|
|
// Different trailing 12 digits
|
|
|
|
const std::string ses_id4 = "ABCDEFGH888888888888";
|
|
|
|
// And change length
|
|
|
|
const std::string ses_id5 = "ABCDEFGHIJ012";
|
|
|
|
const std::string ses_id6 = "ABCDEFGHIJ0123456789ABCD";
|
|
|
|
|
|
|
|
using T = TestIds;
|
|
|
|
std::unordered_set<uint64_t> seen;
|
|
|
|
// Establish a stable schema for the unique IDs. These values must not
|
|
|
|
// change for existing table files.
|
|
|
|
// (Note: parens needed for macro parsing, extra braces needed for some
|
|
|
|
// compilers.)
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id1, ses_id1, 1),
|
|
|
|
T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}},
|
|
|
|
{{0xf0bd230365df7464U, 0xca089303f3648eb4U, 0x4b44f7e7324b2817U}}}));
|
|
|
|
// Only change internal_id[1] with file number
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id1, ses_id1, 2),
|
|
|
|
T({{{0x61d7dcf415d9cf19U, 0x160d77aae90757feU, 0x907f41dfd90724ffU}},
|
|
|
|
{{0xf13fdf7adcfebb6dU, 0x97cd2226cc033ea2U, 0x198c438182091f0eU}}}));
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id1, ses_id1, 123456789),
|
|
|
|
T({{{0x61d7dcf415d9cf19U, 0x160d77aaee5c9ae9U, 0x907f41dfd90724ffU}},
|
|
|
|
{{0x81fbcebe1ac6c4f0U, 0x6b14a64cfdc0f1c4U, 0x7d8fb6eaf18edbb3U}}}));
|
|
|
|
// Change internal_id[1] and internal_id[2] with db_id
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id2, ses_id1, 1),
|
|
|
|
T({{{0x61d7dcf415d9cf19U, 0xf89c471f572f0d25U, 0x1f0f2a5eb0e6257eU}},
|
|
|
|
{{0x7f1d01d453616991U, 0x32ddf2afec804ab2U, 0xd10a1ee2f0c7d9c1U}}}));
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id3, ses_id1, 1),
|
|
|
|
T({{{0x61d7dcf415d9cf19U, 0xfed297a8154a57d0U, 0x8b931b9cdebd9e8U}},
|
|
|
|
{{0x62b2f43183f6894bU, 0x897ff2b460eefad1U, 0xf4ec189fb2d15e04U}}}));
|
|
|
|
// Keeping same last 13 digits of ses_id keeps same internal_id[0]
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id1, ses_id2, 1),
|
|
|
|
T({{{0x61d7dcf415d9cf19U, 0x5f6cc4fa2d528c8U, 0x7b70845d5bfb5446U}},
|
|
|
|
{{0x96d1c83ffcc94266U, 0x82663eac0ec6e14aU, 0x94a88b49678b77f6U}}}));
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id1, ses_id3, 1),
|
|
|
|
T({{{0x61d7dcf415d9cf19U, 0xfc7232879db37ea2U, 0xc0378d74ea4c89cdU}},
|
|
|
|
{{0xdf2ef57e98776905U, 0xda5b31c987da833bU, 0x79c1b4bd0a9e760dU}}}));
|
|
|
|
// Changing last 12 digits of ses_id only changes internal_id[0]
|
|
|
|
// (vs. db_id1, ses_id1, 1)
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id1, ses_id4, 1),
|
|
|
|
T({{{0x4f07cc0d003a83a8U, 0x160d77aae90757fdU, 0x907f41dfd90724ffU}},
|
|
|
|
{{0xbcf85336a9f71f04U, 0x4f2949e2f3adb60dU, 0x9ca0def976abfa10U}}}));
|
|
|
|
// ses_id can change everything.
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id1, ses_id5, 1),
|
|
|
|
T({{{0x94b8768e43f87ce6U, 0xc2559653ac4e7c93U, 0xde6dff6bbb1223U}},
|
|
|
|
{{0x5a9537af681817fbU, 0x1afcd1fecaead5eaU, 0x767077ad9ebe0008U}}}));
|
|
|
|
EXPECT_EQ(
|
|
|
|
GetUniqueId(&tp, &seen, db_id1, ses_id6, 1),
|
|
|
|
T({{{0x43cfb0ffa3b710edU, 0x263c580426406a1bU, 0xfacc91379a80d29dU}},
|
|
|
|
{{0xfa90547d84cb1cdbU, 0x2afe99c641992d4aU, 0x205b7f7b60e51cc2U}}}));
|
|
|
|
|
|
|
|
// Now verify more thoroughly that any small change in inputs completely
|
|
|
|
// changes external unique id.
|
|
|
|
// (Relying on 'seen' checks etc. in GetUniqueId)
|
|
|
|
std::string db_id = "00000000-0000-0000-0000-000000000000";
|
|
|
|
std::string ses_id = "000000000000000000000000";
|
|
|
|
uint64_t file_num = 1;
|
|
|
|
// change db_id
|
|
|
|
for (size_t i = 0; i < db_id.size(); ++i) {
|
|
|
|
if (db_id[i] == '-') {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
for (char alt : std::string("123456789abcdef")) {
|
|
|
|
db_id[i] = alt;
|
|
|
|
GetUniqueId(&tp, &seen, db_id, ses_id, file_num);
|
|
|
|
}
|
|
|
|
db_id[i] = '0';
|
|
|
|
}
|
|
|
|
// change ses_id
|
|
|
|
for (size_t i = 0; i < ses_id.size(); ++i) {
|
|
|
|
for (char alt : std::string("123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) {
|
|
|
|
ses_id[i] = alt;
|
|
|
|
GetUniqueId(&tp, &seen, db_id, ses_id, file_num);
|
|
|
|
}
|
|
|
|
ses_id[i] = '0';
|
|
|
|
}
|
|
|
|
// change file_num
|
|
|
|
for (int i = 1; i < 64; ++i) {
|
|
|
|
GetUniqueId(&tp, &seen, db_id, ses_id, file_num << i);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Verify that "all zeros" in first 128 bits is equivalent for internal and
|
|
|
|
// external IDs. This way, as long as we avoid "all zeros" in internal IDs,
|
|
|
|
// we avoid it in external IDs.
|
|
|
|
{
|
|
|
|
UniqueId64x3 id1{{0, 0, Random::GetTLSInstance()->Next64()}};
|
|
|
|
UniqueId64x3 id2 = id1;
|
|
|
|
InternalUniqueIdToExternal(&id1);
|
|
|
|
EXPECT_EQ(id1, id2);
|
|
|
|
ExternalUniqueIdToInternal(&id2);
|
|
|
|
EXPECT_EQ(id1, id2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
void SetGoodTableProperties(TableProperties* tp) {
|
|
|
|
// To ensure the computation only depends on the expected entries, we set
|
|
|
|
// the rest randomly
|
|
|
|
TEST_SetRandomTableProperties(tp);
|
|
|
|
tp->db_id = "7265b6eb-4e42-4aec-86a4-0dc5e73a228d";
|
|
|
|
tp->db_session_id = "ABCDEFGHIJ0123456789";
|
|
|
|
tp->orig_file_number = 1;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
TEST_F(TablePropertyTest, UniqueIdHumanStrings) {
|
|
|
|
TableProperties tp;
|
|
|
|
SetGoodTableProperties(&tp);
|
|
|
|
|
|
|
|
std::string tmp;
|
2022-05-18 01:43:48 +00:00
|
|
|
EXPECT_OK(GetExtendedUniqueIdFromTableProperties(tp, &tmp));
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
EXPECT_EQ(tmp,
|
|
|
|
(std::string{{'\x64', '\x74', '\xdf', '\x65', '\x03', '\x23',
|
|
|
|
'\xbd', '\xf0', '\xb4', '\x8e', '\x64', '\xf3',
|
|
|
|
'\x03', '\x93', '\x08', '\xca', '\x17', '\x28',
|
|
|
|
'\x4b', '\x32', '\xe7', '\xf7', '\x44', '\x4b'}}));
|
|
|
|
EXPECT_EQ(UniqueIdToHumanString(tmp),
|
|
|
|
"6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B");
|
|
|
|
|
2022-05-18 01:43:48 +00:00
|
|
|
EXPECT_OK(GetUniqueIdFromTableProperties(tp, &tmp));
|
|
|
|
EXPECT_EQ(UniqueIdToHumanString(tmp), "6474DF650323BDF0-B48E64F3039308CA");
|
|
|
|
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
// including zero padding
|
|
|
|
tmp = std::string(24U, '\0');
|
|
|
|
tmp[15] = '\x12';
|
|
|
|
tmp[23] = '\xAB';
|
|
|
|
EXPECT_EQ(UniqueIdToHumanString(tmp),
|
|
|
|
"0000000000000000-0000000000000012-00000000000000AB");
|
|
|
|
|
|
|
|
// And shortened
|
|
|
|
tmp = std::string(20U, '\0');
|
|
|
|
tmp[5] = '\x12';
|
|
|
|
tmp[10] = '\xAB';
|
|
|
|
tmp[17] = '\xEF';
|
|
|
|
EXPECT_EQ(UniqueIdToHumanString(tmp),
|
|
|
|
"0000000000120000-0000AB0000000000-00EF0000");
|
|
|
|
|
|
|
|
tmp.resize(16);
|
|
|
|
EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB0000000000");
|
|
|
|
|
|
|
|
tmp.resize(11);
|
|
|
|
EXPECT_EQ(UniqueIdToHumanString(tmp), "0000000000120000-0000AB");
|
|
|
|
|
|
|
|
tmp.resize(6);
|
|
|
|
EXPECT_EQ(UniqueIdToHumanString(tmp), "000000000012");
|
2022-05-18 01:43:48 +00:00
|
|
|
|
|
|
|
// Also internal IDs to human string
|
|
|
|
UniqueId64x3 euid = {12345, 678, 9};
|
|
|
|
EXPECT_EQ(InternalUniqueIdToHumanString(&euid), "{12345,678,9}");
|
|
|
|
|
|
|
|
UniqueId64x2 uid = {1234, 567890};
|
|
|
|
EXPECT_EQ(InternalUniqueIdToHumanString(&uid), "{1234,567890}");
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(TablePropertyTest, UniqueIdsFailure) {
|
|
|
|
TableProperties tp;
|
|
|
|
std::string tmp;
|
|
|
|
|
|
|
|
// Missing DB id
|
|
|
|
SetGoodTableProperties(&tp);
|
|
|
|
tp.db_id = "";
|
|
|
|
EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
|
2022-05-18 01:43:48 +00:00
|
|
|
EXPECT_TRUE(
|
|
|
|
GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
|
|
|
|
// Missing session id
|
|
|
|
SetGoodTableProperties(&tp);
|
|
|
|
tp.db_session_id = "";
|
|
|
|
EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
|
2022-05-18 01:43:48 +00:00
|
|
|
EXPECT_TRUE(
|
|
|
|
GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
|
|
|
|
// Missing file number
|
|
|
|
SetGoodTableProperties(&tp);
|
|
|
|
tp.orig_file_number = 0;
|
|
|
|
EXPECT_TRUE(GetUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
|
2022-05-18 01:43:48 +00:00
|
|
|
EXPECT_TRUE(
|
|
|
|
GetExtendedUniqueIdFromTableProperties(tp, &tmp).IsNotSupported());
|
Experimental support for SST unique IDs (#8990)
Summary:
* New public header unique_id.h and function GetUniqueIdFromTableProperties
which computes a universally unique identifier based on table properties
of table files from recent RocksDB versions.
* Generation of DB session IDs is refactored so that they are
guaranteed unique in the lifetime of a process running RocksDB.
(SemiStructuredUniqueIdGen, new test included.) Along with file numbers,
this enables SST unique IDs to be guaranteed unique among SSTs generated
in a single process, and "better than random" between processes.
See https://github.com/pdillinger/unique_id
* In addition to public API producing 'external' unique IDs, there is a function
for producing 'internal' unique IDs, with functions for converting between the
two. In short, the external ID is "safe" for things people might do with it, and
the internal ID enables more "power user" features for the future. Specifically,
the external ID goes through a hashing layer so that any subset of bits in the
external ID can be used as a hash of the full ID, while also preserving
uniqueness guarantees in the first 128 bits (bijective both on first 128 bits
and on full 192 bits).
Intended follow-up:
* Use the internal unique IDs in cache keys. (Avoid conflicts with https://github.com/facebook/rocksdb/issues/8912) (The file offset can be XORed into
the third 64-bit value of the unique ID.)
* Publish the external unique IDs in FileStorageInfo (https://github.com/facebook/rocksdb/issues/8968)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8990
Test Plan:
Unit tests added, and checking of unique ids in stress test.
NOTE in stress test we do not generate nearly enough files to thoroughly
stress uniqueness, but the test trims off pieces of the ID to check for
uniqueness so that we can infer (with some assumptions) stronger
properties in the aggregate.
Reviewed By: zhichao-cao, mrambacher
Differential Revision: D31582865
Pulled By: pdillinger
fbshipit-source-id: 1f620c4c86af9abe2a8d177b9ccf2ad2b9f48243
2021-10-19 06:28:28 +00:00
|
|
|
}
|
|
|
|
|
2013-10-10 18:43:24 +00:00
|
|
|
// This test include all the basic checks except those for index size and block
|
|
|
|
// size, which will be conducted in separated unit tests.
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) {
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2013-10-10 18:43:24 +00:00
|
|
|
|
|
|
|
c.Add("a1", "val1");
|
|
|
|
c.Add("b2", "val2");
|
|
|
|
c.Add("c3", "val3");
|
|
|
|
c.Add("d4", "val4");
|
|
|
|
c.Add("e5", "val5");
|
|
|
|
c.Add("f6", "val6");
|
|
|
|
c.Add("g7", "val7");
|
|
|
|
c.Add("h8", "val8");
|
|
|
|
c.Add("j9", "val9");
|
2016-08-19 22:10:31 +00:00
|
|
|
uint64_t diff_internal_user_bytes = 9 * 8; // 8 is seq size, 9 k-v totally
|
2013-10-10 18:43:24 +00:00
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2013-11-20 06:00:48 +00:00
|
|
|
Options options;
|
2013-10-10 18:43:24 +00:00
|
|
|
options.compression = kNoCompression;
|
2018-01-10 23:06:29 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2019-03-01 18:39:00 +00:00
|
|
|
options.statistics->set_stats_level(StatsLevel::kAll);
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options.block_restart_interval = 1;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2013-10-10 18:43:24 +00:00
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2014-08-25 21:22:05 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
2023-04-22 04:57:40 +00:00
|
|
|
ASSERT_EQ(
|
|
|
|
options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSION_REJECTED), 0);
|
2013-10-10 18:43:24 +00:00
|
|
|
|
2014-08-25 23:14:30 +00:00
|
|
|
auto& props = *c.GetTableReader()->GetTableProperties();
|
2013-11-20 00:29:42 +00:00
|
|
|
ASSERT_EQ(kvmap.size(), props.num_entries);
|
2013-10-10 18:43:24 +00:00
|
|
|
|
|
|
|
auto raw_key_size = kvmap.size() * 2ul;
|
|
|
|
auto raw_value_size = kvmap.size() * 4ul;
|
|
|
|
|
2016-08-19 22:10:31 +00:00
|
|
|
ASSERT_EQ(raw_key_size + diff_internal_user_bytes, props.raw_key_size);
|
2013-11-20 00:29:42 +00:00
|
|
|
ASSERT_EQ(raw_value_size, props.raw_value_size);
|
|
|
|
ASSERT_EQ(1ul, props.num_data_blocks);
|
|
|
|
ASSERT_EQ("", props.filter_policy_name); // no filter policy is used
|
2013-10-10 18:43:24 +00:00
|
|
|
|
|
|
|
// Verify data size.
|
2014-09-02 18:49:38 +00:00
|
|
|
BlockBuilder block_builder(1);
|
2013-10-10 18:43:24 +00:00
|
|
|
for (const auto& item : kvmap) {
|
|
|
|
block_builder.Add(item.first, item.second);
|
|
|
|
}
|
|
|
|
Slice content = block_builder.Finish();
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
ASSERT_EQ(content.size() + BlockBasedTable::kBlockTrailerSize +
|
|
|
|
diff_internal_user_bytes,
|
2016-08-19 22:10:31 +00:00
|
|
|
props.data_size);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2013-10-10 18:43:24 +00:00
|
|
|
}
|
|
|
|
|
2018-01-17 01:26:29 +00:00
|
|
|
#ifdef SNAPPY
|
2018-01-10 23:06:29 +00:00
|
|
|
uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) {
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
constexpr size_t kNumKeys = 10000;
|
|
|
|
|
|
|
|
for (size_t k = 0; k < kNumKeys; ++k) {
|
2022-05-06 20:03:58 +00:00
|
|
|
c.Add("key" + std::to_string(k), "val" + std::to_string(k));
|
2018-01-10 23:06:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
options.compression = kSnappyCompression;
|
|
|
|
options.statistics = CreateDBStatistics();
|
2019-03-01 18:39:00 +00:00
|
|
|
options.statistics->set_stats_level(StatsLevel::kAll);
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2018-01-10 23:06:29 +00:00
|
|
|
table_options.block_restart_interval = 1;
|
|
|
|
table_options.enable_index_compression = compressed;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2018-01-10 23:06:29 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
c.ResetTableReader();
|
|
|
|
return options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
|
|
|
|
}
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, IndexUncompressed) {
|
2018-01-10 23:06:29 +00:00
|
|
|
uint64_t tbl1_compressed_cnt = IndexUncompressedHelper(true);
|
|
|
|
uint64_t tbl2_compressed_cnt = IndexUncompressedHelper(false);
|
|
|
|
// tbl1_compressed_cnt should include 1 index block
|
|
|
|
EXPECT_EQ(tbl2_compressed_cnt + 1, tbl1_compressed_cnt);
|
|
|
|
}
|
2018-01-17 01:26:29 +00:00
|
|
|
#endif // SNAPPY
|
2018-01-10 23:06:29 +00:00
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) {
|
2016-04-21 17:16:28 +00:00
|
|
|
TableConstructor c(&reverse_key_comparator);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
|
|
|
|
{
|
|
|
|
Options options;
|
2016-05-12 16:47:16 +00:00
|
|
|
options.compression = CompressionType::kNoCompression;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2016-04-21 17:16:28 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2016-04-21 17:16:28 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
|
|
|
|
auto& props = *c.GetTableReader()->GetTableProperties();
|
|
|
|
|
|
|
|
// Default comparator
|
|
|
|
ASSERT_EQ("leveldb.BytewiseComparator", props.comparator_name);
|
|
|
|
// No merge operator
|
|
|
|
ASSERT_EQ("nullptr", props.merge_operator_name);
|
2016-08-26 18:46:32 +00:00
|
|
|
// No prefix extractor
|
|
|
|
ASSERT_EQ("nullptr", props.prefix_extractor_name);
|
2016-04-21 17:16:28 +00:00
|
|
|
// No property collectors
|
|
|
|
ASSERT_EQ("[]", props.property_collectors_names);
|
|
|
|
// No filter policy is used
|
|
|
|
ASSERT_EQ("", props.filter_policy_name);
|
2016-05-12 16:47:16 +00:00
|
|
|
// Compression type == that set:
|
|
|
|
ASSERT_EQ("NoCompression", props.compression_name);
|
2016-04-21 17:16:28 +00:00
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
Options options;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2016-04-21 17:16:28 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.comparator = &reverse_key_comparator;
|
|
|
|
options.merge_operator = MergeOperators::CreateUInt64AddOperator();
|
2016-08-26 18:46:32 +00:00
|
|
|
options.prefix_extractor.reset(NewNoopTransform());
|
2016-04-21 17:16:28 +00:00
|
|
|
options.table_properties_collector_factories.emplace_back(
|
|
|
|
new DummyPropertiesCollectorFactory1());
|
|
|
|
options.table_properties_collector_factories.emplace_back(
|
|
|
|
new DummyPropertiesCollectorFactory2());
|
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2016-04-21 17:16:28 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
|
|
|
|
auto& props = *c.GetTableReader()->GetTableProperties();
|
|
|
|
|
|
|
|
ASSERT_EQ("rocksdb.ReverseBytewiseComparator", props.comparator_name);
|
|
|
|
ASSERT_EQ("UInt64AddOperator", props.merge_operator_name);
|
2016-08-26 18:46:32 +00:00
|
|
|
ASSERT_EQ("rocksdb.Noop", props.prefix_extractor_name);
|
2020-10-01 22:21:17 +00:00
|
|
|
ASSERT_EQ(
|
|
|
|
"[DummyPropertiesCollectorFactory1,DummyPropertiesCollectorFactory2]",
|
|
|
|
props.property_collectors_names);
|
2016-04-21 17:16:28 +00:00
|
|
|
ASSERT_EQ("", props.filter_policy_name); // no filter policy is used
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, RangeDelBlock) {
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
std::vector<std::string> keys = {"1pika", "2chu"};
|
|
|
|
std::vector<std::string> vals = {"p", "c"};
|
|
|
|
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
2018-10-26 02:25:00 +00:00
|
|
|
std::vector<RangeTombstone> expected_tombstones = {
|
|
|
|
{"1pika", "2chu", 0},
|
|
|
|
{"2chu", "c", 1},
|
|
|
|
{"2chu", "c", 0},
|
|
|
|
{"c", "p", 0},
|
|
|
|
};
|
|
|
|
|
2016-08-19 22:10:31 +00:00
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
RangeTombstone t(keys[i], vals[i], i);
|
|
|
|
std::pair<InternalKey, Slice> p = t.Serialize();
|
|
|
|
c.Add(p.first.Encode().ToString(), p.second);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> sorted_keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
options.compression = kNoCompression;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2016-08-19 22:10:31 +00:00
|
|
|
table_options.block_restart_interval = 1;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
2016-08-19 22:10:31 +00:00
|
|
|
std::unique_ptr<InternalKeyComparator> internal_cmp(
|
|
|
|
new InternalKeyComparator(options.comparator));
|
2018-05-21 21:33:55 +00:00
|
|
|
c.Finish(options, ioptions, moptions, table_options, *internal_cmp,
|
|
|
|
&sorted_keys, &kvmap);
|
2016-08-19 22:10:31 +00:00
|
|
|
|
2016-11-05 16:10:51 +00:00
|
|
|
for (int j = 0; j < 2; ++j) {
|
|
|
|
std::unique_ptr<InternalIterator> iter(
|
|
|
|
c.GetTableReader()->NewRangeTombstoneIterator(ReadOptions()));
|
|
|
|
if (j > 0) {
|
|
|
|
// For second iteration, delete the table reader object and verify the
|
|
|
|
// iterator can still access its metablock's range tombstones.
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
2017-03-09 06:16:45 +00:00
|
|
|
ASSERT_FALSE(iter->Valid());
|
2016-11-05 16:10:51 +00:00
|
|
|
iter->SeekToFirst();
|
2017-03-09 06:16:45 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
2018-10-26 02:25:00 +00:00
|
|
|
for (size_t i = 0; i < expected_tombstones.size(); i++) {
|
2016-11-05 16:10:51 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ParsedInternalKey parsed_key;
|
2020-10-28 17:11:13 +00:00
|
|
|
ASSERT_OK(
|
|
|
|
ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */));
|
2016-11-05 16:10:51 +00:00
|
|
|
RangeTombstone t(parsed_key, iter->value());
|
Cache fragmented range tombstones in BlockBasedTableReader (#4493)
Summary:
This allows tombstone fragmenting to only be performed when the table is opened, and cached for subsequent accesses.
On the same DB used in #4449, running `readrandom` results in the following:
```
readrandom : 0.983 micros/op 1017076 ops/sec; 78.3 MB/s (63103 of 100000 found)
```
Now that Get performance in the presence of range tombstones is reasonable, I also compared the performance between a DB with range tombstones, "expanded" range tombstones (several point tombstones that cover the same keys the equivalent range tombstone would cover, a common workaround for DeleteRange), and no range tombstones. The created DBs had 5 million keys each, and DeleteRange was called at regular intervals (depending on the total number of range tombstones being written) after 4.5 million Puts. The table below summarizes the results of a `readwhilewriting` benchmark (in order to provide somewhat more realistic results):
```
Tombstones? | avg micros/op | stddev micros/op | avg ops/s | stddev ops/s
----------------- | ------------- | ---------------- | ------------ | ------------
None | 0.6186 | 0.04637 | 1,625,252.90 | 124,679.41
500 Expanded | 0.6019 | 0.03628 | 1,666,670.40 | 101,142.65
500 Unexpanded | 0.6435 | 0.03994 | 1,559,979.40 | 104,090.52
1k Expanded | 0.6034 | 0.04349 | 1,665,128.10 | 125,144.57
1k Unexpanded | 0.6261 | 0.03093 | 1,600,457.50 | 79,024.94
5k Expanded | 0.6163 | 0.05926 | 1,636,668.80 | 154,888.85
5k Unexpanded | 0.6402 | 0.04002 | 1,567,804.70 | 100,965.55
10k Expanded | 0.6036 | 0.05105 | 1,667,237.70 | 142,830.36
10k Unexpanded | 0.6128 | 0.02598 | 1,634,633.40 | 72,161.82
25k Expanded | 0.6198 | 0.04542 | 1,620,980.50 | 116,662.93
25k Unexpanded | 0.5478 | 0.0362 | 1,833,059.10 | 121,233.81
50k Expanded | 0.5104 | 0.04347 | 1,973,107.90 | 184,073.49
50k Unexpanded | 0.4528 | 0.03387 | 2,219,034.50 | 170,984.32
```
After a large enough quantity of range tombstones are written, range tombstone Gets can become faster than reading from an equivalent DB with several point tombstones.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4493
Differential Revision: D10842844
Pulled By: abhimadan
fbshipit-source-id: a7d44534f8120e6aabb65779d26c6b9df954c509
2018-10-26 02:25:00 +00:00
|
|
|
const auto& expected_t = expected_tombstones[i];
|
|
|
|
ASSERT_EQ(t.start_key_, expected_t.start_key_);
|
|
|
|
ASSERT_EQ(t.end_key_, expected_t.end_key_);
|
|
|
|
ASSERT_EQ(t.seq_, expected_t.seq_);
|
2016-11-05 16:10:51 +00:00
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(!iter->Valid());
|
|
|
|
}
|
2016-08-19 22:10:31 +00:00
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) {
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2013-10-16 23:57:20 +00:00
|
|
|
c.Add("a1", "val1");
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
|
2013-11-20 06:00:48 +00:00
|
|
|
Options options;
|
2014-08-25 21:22:05 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2013-10-16 23:57:20 +00:00
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2014-08-25 21:22:05 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
2014-08-25 23:14:30 +00:00
|
|
|
auto& props = *c.GetTableReader()->GetTableProperties();
|
2022-02-18 20:23:48 +00:00
|
|
|
ASSERT_EQ(table_options.filter_policy->Name(), props.filter_policy_name);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2013-10-16 23:57:20 +00:00
|
|
|
}
|
|
|
|
|
2015-03-03 01:07:03 +00:00
|
|
|
//
|
|
|
|
// BlockBasedTableTest::PrefetchTest
|
|
|
|
//
|
|
|
|
void AssertKeysInCache(BlockBasedTable* table_reader,
|
2015-09-02 20:58:22 +00:00
|
|
|
const std::vector<std::string>& keys_in_cache,
|
2016-08-19 22:10:31 +00:00
|
|
|
const std::vector<std::string>& keys_not_in_cache,
|
|
|
|
bool convert = false) {
|
|
|
|
if (convert) {
|
2023-12-01 19:15:17 +00:00
|
|
|
for (const auto& key : keys_in_cache) {
|
2016-08-19 22:10:31 +00:00
|
|
|
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
|
|
|
|
}
|
2023-12-01 19:15:17 +00:00
|
|
|
for (const auto& key : keys_not_in_cache) {
|
2016-08-19 22:10:31 +00:00
|
|
|
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
|
|
|
|
}
|
|
|
|
} else {
|
2023-12-01 19:15:17 +00:00
|
|
|
for (const auto& key : keys_in_cache) {
|
2016-08-19 22:10:31 +00:00
|
|
|
ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
|
|
|
|
}
|
2023-12-01 19:15:17 +00:00
|
|
|
for (const auto& key : keys_not_in_cache) {
|
2016-08-19 22:10:31 +00:00
|
|
|
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key));
|
|
|
|
}
|
2015-03-03 01:07:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void PrefetchRange(TableConstructor* c, Options* opt,
|
2016-08-19 22:10:31 +00:00
|
|
|
BlockBasedTableOptions* table_options, const char* key_begin,
|
2015-09-02 20:58:22 +00:00
|
|
|
const char* key_end,
|
|
|
|
const std::vector<std::string>& keys_in_cache,
|
|
|
|
const std::vector<std::string>& keys_not_in_cache,
|
2015-03-03 01:07:03 +00:00
|
|
|
const Status expected_status = Status::OK()) {
|
|
|
|
// reset the cache and reopen the table
|
2016-04-07 20:51:47 +00:00
|
|
|
table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4);
|
2015-03-03 01:07:03 +00:00
|
|
|
opt->table_factory.reset(NewBlockBasedTableFactory(*table_options));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions2(*opt);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(*opt);
|
|
|
|
ASSERT_OK(c->Reopen(ioptions2, moptions));
|
2015-03-03 01:07:03 +00:00
|
|
|
|
|
|
|
// prefetch
|
|
|
|
auto* table_reader = dynamic_cast<BlockBasedTable*>(c->GetTableReader());
|
2016-08-19 22:10:31 +00:00
|
|
|
Status s;
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<Slice> begin, end;
|
|
|
|
std::unique_ptr<InternalKey> i_begin, i_end;
|
2016-08-19 22:10:31 +00:00
|
|
|
if (key_begin != nullptr) {
|
|
|
|
if (c->ConvertToInternalKey()) {
|
|
|
|
i_begin.reset(new InternalKey(key_begin, kMaxSequenceNumber, kTypeValue));
|
|
|
|
begin.reset(new Slice(i_begin->Encode()));
|
|
|
|
} else {
|
|
|
|
begin.reset(new Slice(key_begin));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (key_end != nullptr) {
|
|
|
|
if (c->ConvertToInternalKey()) {
|
|
|
|
i_end.reset(new InternalKey(key_end, kMaxSequenceNumber, kTypeValue));
|
|
|
|
end.reset(new Slice(i_end->Encode()));
|
|
|
|
} else {
|
|
|
|
end.reset(new Slice(key_end));
|
|
|
|
}
|
|
|
|
}
|
2023-04-21 16:07:18 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
s = table_reader->Prefetch(read_options, begin.get(), end.get());
|
2016-08-19 22:10:31 +00:00
|
|
|
|
2015-03-03 01:07:03 +00:00
|
|
|
ASSERT_TRUE(s.code() == expected_status.code());
|
|
|
|
|
|
|
|
// assert our expectation in cache warmup
|
2016-08-19 22:10:31 +00:00
|
|
|
AssertKeysInCache(table_reader, keys_in_cache, keys_not_in_cache,
|
|
|
|
c->ConvertToInternalKey());
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c->ResetTableReader();
|
2015-03-03 01:07:03 +00:00
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, PrefetchTest) {
|
2015-03-03 01:07:03 +00:00
|
|
|
// The purpose of this test is to test the prefetching operation built into
|
|
|
|
// BlockBasedTable.
|
|
|
|
Options opt;
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<InternalKeyComparator> ikc;
|
2015-03-03 01:07:03 +00:00
|
|
|
ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
|
|
|
|
opt.compression = kNoCompression;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2015-03-03 01:07:03 +00:00
|
|
|
table_options.block_size = 1024;
|
|
|
|
// big enough so we don't ever lose cached values.
|
2016-04-07 20:51:47 +00:00
|
|
|
table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
|
2015-03-03 01:07:03 +00:00
|
|
|
opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2015-03-03 01:07:03 +00:00
|
|
|
c.Add("k01", "hello");
|
|
|
|
c.Add("k02", "hello2");
|
|
|
|
c.Add("k03", std::string(10000, 'x'));
|
|
|
|
c.Add("k04", std::string(200000, 'x'));
|
|
|
|
c.Add("k05", std::string(300000, 'x'));
|
|
|
|
c.Add("k06", "hello3");
|
|
|
|
c.Add("k07", std::string(100000, 'x'));
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(opt);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(opt);
|
|
|
|
c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2015-03-03 01:07:03 +00:00
|
|
|
|
|
|
|
// We get the following data spread :
|
|
|
|
//
|
|
|
|
// Data block Index
|
|
|
|
// ========================
|
|
|
|
// [ k01 k02 k03 ] k03
|
|
|
|
// [ k04 ] k04
|
|
|
|
// [ k05 ] k05
|
|
|
|
// [ k06 k07 ] k07
|
|
|
|
|
|
|
|
// Simple
|
2016-08-19 22:10:31 +00:00
|
|
|
PrefetchRange(&c, &opt, &table_options,
|
|
|
|
/*key_range=*/"k01", "k05",
|
|
|
|
/*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"},
|
|
|
|
/*keys_not_in_cache=*/{"k06", "k07"});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"},
|
2015-03-03 01:07:03 +00:00
|
|
|
{"k04", "k05", "k06", "k07"});
|
|
|
|
// odd
|
2016-08-19 22:10:31 +00:00
|
|
|
PrefetchRange(&c, &opt, &table_options, "a", "z",
|
|
|
|
{"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k00", "k00", {"k01", "k02", "k03"},
|
2015-03-03 01:07:03 +00:00
|
|
|
{"k04", "k05", "k06", "k07"});
|
|
|
|
// Edge cases
|
2016-08-19 22:10:31 +00:00
|
|
|
PrefetchRange(&c, &opt, &table_options, "k00", "k06",
|
|
|
|
{"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k00", "zzz",
|
|
|
|
{"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
|
2015-03-03 01:07:03 +00:00
|
|
|
// null keys
|
2016-08-19 22:10:31 +00:00
|
|
|
PrefetchRange(&c, &opt, &table_options, nullptr, nullptr,
|
|
|
|
{"k01", "k02", "k03", "k04", "k05", "k06", "k07"}, {});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, "k04", nullptr,
|
|
|
|
{"k04", "k05", "k06", "k07"}, {"k01", "k02", "k03"});
|
|
|
|
PrefetchRange(&c, &opt, &table_options, nullptr, "k05",
|
|
|
|
{"k01", "k02", "k03", "k04", "k05"}, {"k06", "k07"});
|
2015-03-03 01:07:03 +00:00
|
|
|
// invalid
|
2016-08-19 22:10:31 +00:00
|
|
|
PrefetchRange(&c, &opt, &table_options, "k06", "k00", {}, {},
|
2015-03-03 01:07:03 +00:00
|
|
|
Status::InvalidArgument(Slice("k06 "), Slice("k07")));
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2015-03-03 01:07:03 +00:00
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2022-03-01 21:58:02 +00:00
|
|
|
for (int i = 0; i <= 4; ++i) {
|
2014-08-25 23:14:30 +00:00
|
|
|
Options options;
|
|
|
|
// Make each key/value an individual block
|
|
|
|
table_options.block_size = 64;
|
|
|
|
switch (i) {
|
2022-10-25 18:50:38 +00:00
|
|
|
case 0:
|
|
|
|
// Binary search index
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kBinarySearch;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
break;
|
|
|
|
case 1:
|
|
|
|
// Hash search index
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kHashSearch;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(4));
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
// Hash search index with filter policy
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kHashSearch;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(4));
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
// Two-level index
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
break;
|
|
|
|
case 4:
|
|
|
|
// Binary search with first key
|
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::kBinarySearchWithFirstKey;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
break;
|
2014-08-25 23:14:30 +00:00
|
|
|
}
|
|
|
|
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(),
|
|
|
|
true /* convert_to_internal_key_ */);
|
2014-08-25 23:14:30 +00:00
|
|
|
c.Add("aaaa1", std::string('a', 56));
|
|
|
|
c.Add("bbaa1", std::string('a', 56));
|
|
|
|
c.Add("cccc1", std::string('a', 56));
|
|
|
|
c.Add("bbbb1", std::string('a', 56));
|
|
|
|
c.Add("baaa1", std::string('a', 56));
|
|
|
|
c.Add("abbb1", std::string('a', 56));
|
|
|
|
c.Add("cccc2", std::string('a', 56));
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2014-08-25 23:14:30 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
auto props = c.GetTableReader()->GetTableProperties();
|
|
|
|
ASSERT_EQ(7u, props->num_data_blocks);
|
|
|
|
auto* reader = c.GetTableReader();
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
2019-06-20 21:28:22 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(reader->NewIterator(
|
|
|
|
ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
2014-08-25 23:14:30 +00:00
|
|
|
|
|
|
|
iter->Seek(InternalKey("b", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("baaa1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
|
|
|
|
iter->Seek(InternalKey("bb", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bbaa1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
|
|
|
|
iter->Seek(InternalKey("bbb", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bbbb1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("cccc1", ExtractUserKey(iter->key()).ToString());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, NoopTransformSeek) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2016-01-06 19:46:32 +00:00
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.comparator = BytewiseComparator();
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
options.prefix_extractor.reset(NewNoopTransform());
|
|
|
|
|
|
|
|
TableConstructor c(options.comparator);
|
|
|
|
// To tickle the PrefixMayMatch bug it is important that the
|
|
|
|
// user-key is a single byte so that the index key exactly matches
|
|
|
|
// the user-key.
|
|
|
|
InternalKey key("a", 1, kTypeValue);
|
|
|
|
c.Add(key.Encode().ToString(), "b");
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
2016-01-07 17:48:29 +00:00
|
|
|
const InternalKeyComparator internal_comparator(options.comparator);
|
2018-05-21 21:33:55 +00:00
|
|
|
c.Finish(options, ioptions, moptions, table_options, internal_comparator,
|
|
|
|
&keys, &kvmap);
|
2016-01-06 19:46:32 +00:00
|
|
|
|
|
|
|
auto* reader = c.GetTableReader();
|
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = (i == 0);
|
2019-06-20 21:28:22 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(reader->NewIterator(
|
|
|
|
ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
2016-01-06 19:46:32 +00:00
|
|
|
|
|
|
|
iter->Seek(key.Encode());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("a", ExtractUserKey(iter->key()).ToString());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) {
|
2016-08-26 18:46:32 +00:00
|
|
|
// if DB is opened with a prefix extractor of a different name,
|
|
|
|
// prefix bloom is skipped when read the file
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2016-08-26 18:46:32 +00:00
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(2));
|
|
|
|
table_options.whole_key_filtering = false;
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.comparator = BytewiseComparator();
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(1));
|
|
|
|
|
|
|
|
TableConstructor c(options.comparator);
|
|
|
|
InternalKey key("abcdefghijk", 1, kTypeValue);
|
|
|
|
c.Add(key.Encode().ToString(), "test");
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
2016-08-26 18:46:32 +00:00
|
|
|
const InternalKeyComparator internal_comparator(options.comparator);
|
2018-05-21 21:33:55 +00:00
|
|
|
c.Finish(options, ioptions, moptions, table_options, internal_comparator,
|
|
|
|
&keys, &kvmap);
|
|
|
|
// TODO(Zhongyi): update test to use MutableCFOptions
|
2016-08-26 18:46:32 +00:00
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(9));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions new_ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions new_moptions(options);
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(c.Reopen(new_ioptions, new_moptions));
|
2016-08-26 18:46:32 +00:00
|
|
|
auto reader = c.GetTableReader();
|
2020-08-03 22:21:56 +00:00
|
|
|
ReadOptions read_options;
|
2019-06-20 21:28:22 +00:00
|
|
|
std::unique_ptr<InternalIterator> db_iter(reader->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, new_moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
2019-06-20 21:28:22 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
2016-08-26 18:46:32 +00:00
|
|
|
|
|
|
|
// Test point lookup
|
|
|
|
// only one kv
|
|
|
|
for (auto& kv : kvmap) {
|
|
|
|
db_iter->Seek(kv.first);
|
|
|
|
ASSERT_TRUE(db_iter->Valid());
|
|
|
|
ASSERT_OK(db_iter->status());
|
|
|
|
ASSERT_EQ(db_iter->key(), kv.first);
|
|
|
|
ASSERT_EQ(db_iter->value(), kv.second);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BadChecksumType) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.comparator = BytewiseComparator();
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
TableConstructor c(options.comparator);
|
|
|
|
InternalKey key("abc", 1, kTypeValue);
|
|
|
|
c.Add(key.Encode().ToString(), "test");
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
const InternalKeyComparator internal_comparator(options.comparator);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, internal_comparator,
|
|
|
|
&keys, &kvmap);
|
|
|
|
|
|
|
|
// Corrupt checksum type (123 is invalid)
|
|
|
|
auto& sink = *c.TEST_GetSink();
|
|
|
|
size_t len = sink.contents_.size();
|
2022-10-22 01:09:12 +00:00
|
|
|
ASSERT_EQ(sink.contents_[len - Footer::kNewVersionsEncodedLength],
|
|
|
|
table_options.checksum);
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
sink.contents_[len - Footer::kNewVersionsEncodedLength] = char{123};
|
|
|
|
|
|
|
|
// (Re-)Open table file with bad checksum type
|
|
|
|
const ImmutableOptions new_ioptions(options);
|
|
|
|
const MutableCFOptions new_moptions(options);
|
|
|
|
Status s = c.Reopen(new_ioptions, new_moptions);
|
|
|
|
ASSERT_NOK(s);
|
2022-12-09 18:03:47 +00:00
|
|
|
// "test" is file name
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(s.ToString(),
|
2022-12-09 18:03:47 +00:00
|
|
|
"Corruption: Corrupt or unsupported checksum type: 123 in test");
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
}
|
|
|
|
|
2023-03-06 19:53:09 +00:00
|
|
|
class BuiltinChecksumTest : public testing::Test,
|
|
|
|
public testing::WithParamInterface<ChecksumType> {};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(SupportedChecksums, BuiltinChecksumTest,
|
|
|
|
testing::ValuesIn(GetSupportedChecksums()));
|
|
|
|
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
namespace {
|
2021-11-04 16:08:12 +00:00
|
|
|
std::string ChecksumAsString(const std::string& data,
|
|
|
|
ChecksumType checksum_type) {
|
|
|
|
uint32_t v = ComputeBuiltinChecksum(checksum_type, data.data(), data.size());
|
|
|
|
|
|
|
|
// Verify consistency with other function
|
|
|
|
if (data.size() >= 1) {
|
|
|
|
EXPECT_EQ(v, ComputeBuiltinChecksumWithLastByte(
|
|
|
|
checksum_type, data.data(), data.size() - 1, data.back()));
|
|
|
|
}
|
|
|
|
// Little endian as in file
|
|
|
|
std::array<char, 4> raw_bytes;
|
|
|
|
EncodeFixed32(raw_bytes.data(), v);
|
|
|
|
return Slice(raw_bytes.data(), raw_bytes.size()).ToString(/*hex*/ true);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string ChecksumAsString(std::string* data, char new_last_byte,
|
|
|
|
ChecksumType checksum_type) {
|
|
|
|
data->back() = new_last_byte;
|
|
|
|
return ChecksumAsString(*data, checksum_type);
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Make sure that checksum values don't change in later versions, even if
|
2021-11-04 16:08:12 +00:00
|
|
|
// consistent within current version.
|
2023-03-06 19:53:09 +00:00
|
|
|
TEST_P(BuiltinChecksumTest, ChecksumSchemas) {
|
|
|
|
// Trailing 'x' chars will be replaced by compression type. Specifically,
|
|
|
|
// the first byte of a block trailer is compression type, which is part of
|
|
|
|
// the checksum input. This test does not deal with storing or parsing
|
|
|
|
// checksums from the trailer (next 4 bytes of trailer).
|
2021-11-04 16:08:12 +00:00
|
|
|
std::string b0 = "x";
|
|
|
|
std::string b1 = "This is a short block!x";
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
std::string b2;
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
|
|
|
b2.append("This is a long block!");
|
|
|
|
}
|
2021-11-04 16:08:12 +00:00
|
|
|
b2.append("x");
|
|
|
|
|
|
|
|
std::string empty;
|
|
|
|
|
|
|
|
char ct1 = kNoCompression;
|
|
|
|
char ct2 = kSnappyCompression;
|
|
|
|
char ct3 = kZSTD;
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
|
2023-03-06 19:53:09 +00:00
|
|
|
ChecksumType t = GetParam();
|
|
|
|
switch (t) {
|
|
|
|
case kNoChecksum:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00000000");
|
|
|
|
break;
|
|
|
|
case kCRC32c:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "D8EA82A2");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "D28F2549");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "052B2843");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "46F8F711");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "583F0355");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "2F9B0A57");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "ECE7DA1D");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "943EF0AB");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "43A2EDB1");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00E53D63");
|
|
|
|
break;
|
|
|
|
case kxxHash:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "055DCC02");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "3EB065CF");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "31F79238");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "320D2E00");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "4A2E5FB0");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "0BD9F652");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "B4107E50");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "20F4D4BA");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "8F1A1F99");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "A191A338");
|
|
|
|
break;
|
|
|
|
case kxxHash64:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "99E9D851");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "682705DB");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "30E7211B");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "B7BB58E8");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B74655EF");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "B6C8BBBE");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "AED9E3B4");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "0D4999FE");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "F5932423");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "6B31BAB1");
|
|
|
|
break;
|
|
|
|
case kXXH3:
|
|
|
|
EXPECT_EQ(ChecksumAsString(empty, t), "00000000");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "C294D338");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "1B174353");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "2D0E20C8");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B37FB5E6");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "6AFC258D");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "5CE54616");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "FA2D482E");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "23AED845");
|
|
|
|
EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "15B7BBDE");
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
// Force this test to be updated on new ChecksumTypes
|
|
|
|
assert(false);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BuiltinChecksumTest, ChecksumZeroInputs) {
|
|
|
|
// Verify that no reasonably sized "all zeros" inputs produce "all zeros"
|
|
|
|
// output. Otherwise, "wiped" data could appear to be well-formed.
|
|
|
|
// Assuming essentially random assignment of output values, the likelihood
|
|
|
|
// of encountering checksum == 0 for an input not specifically crafted is
|
|
|
|
// 1 in 4 billion.
|
|
|
|
if (GetParam() == kNoChecksum) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// "Thorough" case is too slow for continouous testing
|
|
|
|
bool thorough = getenv("ROCKSDB_THOROUGH_CHECKSUM_TEST") != nullptr;
|
|
|
|
// Verified through 10M
|
|
|
|
size_t kMaxZerosLen = thorough ? 10000000 : 20000;
|
|
|
|
std::string zeros(kMaxZerosLen, '\0');
|
|
|
|
|
|
|
|
for (size_t len = 0; len < kMaxZerosLen; ++len) {
|
|
|
|
if (thorough && (len & 0xffffU) == 0) {
|
|
|
|
fprintf(stderr, "t=%u len=%u\n", (unsigned)GetParam(), (unsigned)len);
|
|
|
|
}
|
|
|
|
uint32_t v = ComputeBuiltinChecksum(GetParam(), zeros.data(), len);
|
|
|
|
if (v == 0U) {
|
|
|
|
// One exception case:
|
|
|
|
if (GetParam() == kXXH3 && len == 0) {
|
|
|
|
// This is not a big deal because assuming the block length is known
|
|
|
|
// from the block handle, which comes from a checksum-verified block,
|
|
|
|
// there is nothing to corrupt in a zero-length block. And when there
|
|
|
|
// is a block trailer with compression byte (as in block-based table),
|
|
|
|
// zero length checksummed data never arises.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// Only compute this on failure
|
|
|
|
SCOPED_TRACE("len=" + std::to_string(len));
|
|
|
|
ASSERT_NE(v, 0U);
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-09-26 16:14:05 +00:00
|
|
|
void AddInternalKey(TableConstructor* c, const std::string& prefix,
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
std::string value = "v", int /*suffix_len*/ = 800) {
|
2014-04-10 21:19:43 +00:00
|
|
|
static Random rnd(1023);
|
2020-07-09 21:33:42 +00:00
|
|
|
InternalKey k(prefix + rnd.RandomString(800), 0, kTypeValue);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
c->Add(k.Encode().ToString(), value);
|
2014-04-10 21:19:43 +00:00
|
|
|
}
|
|
|
|
|
2017-02-07 00:29:29 +00:00
|
|
|
void TableTest::IndexTest(BlockBasedTableOptions table_options) {
|
2014-04-10 21:19:43 +00:00
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
|
|
|
|
// keys with prefix length 3, make sure the key/value is big enough to fill
|
|
|
|
// one block
|
|
|
|
AddInternalKey(&c, "0015");
|
|
|
|
AddInternalKey(&c, "0035");
|
|
|
|
|
|
|
|
AddInternalKey(&c, "0054");
|
|
|
|
AddInternalKey(&c, "0055");
|
|
|
|
|
|
|
|
AddInternalKey(&c, "0056");
|
|
|
|
AddInternalKey(&c, "0057");
|
|
|
|
|
|
|
|
AddInternalKey(&c, "0058");
|
|
|
|
AddInternalKey(&c, "0075");
|
|
|
|
|
|
|
|
AddInternalKey(&c, "0076");
|
|
|
|
AddInternalKey(&c, "0095");
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2014-04-10 21:19:43 +00:00
|
|
|
Options options;
|
2014-08-25 21:22:05 +00:00
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(3));
|
|
|
|
table_options.block_size = 1700;
|
2016-04-07 20:51:47 +00:00
|
|
|
table_options.block_cache = NewLRUCache(1024, 4);
|
2014-08-25 21:22:05 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2014-04-10 21:19:43 +00:00
|
|
|
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
|
|
|
|
&kvmap);
|
2014-08-25 23:14:30 +00:00
|
|
|
auto reader = c.GetTableReader();
|
2014-04-10 21:19:43 +00:00
|
|
|
|
2014-08-25 23:14:30 +00:00
|
|
|
auto props = reader->GetTableProperties();
|
2014-04-10 21:19:43 +00:00
|
|
|
ASSERT_EQ(5u, props->num_data_blocks);
|
|
|
|
|
2018-05-21 21:33:55 +00:00
|
|
|
// TODO(Zhongyi): update test to use MutableCFOptions
|
2020-08-03 22:21:56 +00:00
|
|
|
ReadOptions read_options;
|
2019-06-20 21:28:22 +00:00
|
|
|
std::unique_ptr<InternalIterator> index_iter(reader->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
2019-06-20 21:28:22 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
2014-04-10 21:19:43 +00:00
|
|
|
|
|
|
|
// -- Find keys do not exist, but have common prefix.
|
|
|
|
std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"};
|
2020-03-17 19:28:46 +00:00
|
|
|
std::vector<std::string> lower_bound = {
|
|
|
|
keys[0], keys[1], keys[2], keys[7], keys[9],
|
|
|
|
};
|
2014-04-10 21:19:43 +00:00
|
|
|
|
|
|
|
// find the lower bound of the prefix
|
|
|
|
for (size_t i = 0; i < prefixes.size(); ++i) {
|
2017-02-07 00:29:29 +00:00
|
|
|
index_iter->Seek(InternalKey(prefixes[i], 0, kTypeValue).Encode());
|
|
|
|
ASSERT_OK(index_iter->status());
|
|
|
|
ASSERT_TRUE(index_iter->Valid());
|
2014-04-10 21:19:43 +00:00
|
|
|
|
|
|
|
// seek the first element in the block
|
2017-02-07 00:29:29 +00:00
|
|
|
ASSERT_EQ(lower_bound[i], index_iter->key().ToString());
|
|
|
|
ASSERT_EQ("v", index_iter->value().ToString());
|
2014-04-10 21:19:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// find the upper bound of prefixes
|
2022-10-25 18:50:38 +00:00
|
|
|
std::vector<std::string> upper_bound = {
|
|
|
|
keys[1],
|
|
|
|
keys[2],
|
|
|
|
keys[7],
|
|
|
|
keys[9],
|
|
|
|
};
|
2014-04-10 21:19:43 +00:00
|
|
|
|
|
|
|
// find existing keys
|
|
|
|
for (const auto& item : kvmap) {
|
|
|
|
auto ukey = ExtractUserKey(item.first).ToString();
|
2017-02-07 00:29:29 +00:00
|
|
|
index_iter->Seek(ukey);
|
2014-04-10 21:19:43 +00:00
|
|
|
|
|
|
|
// ASSERT_OK(regular_iter->status());
|
2017-02-07 00:29:29 +00:00
|
|
|
ASSERT_OK(index_iter->status());
|
2014-04-10 21:19:43 +00:00
|
|
|
|
|
|
|
// ASSERT_TRUE(regular_iter->Valid());
|
2017-02-07 00:29:29 +00:00
|
|
|
ASSERT_TRUE(index_iter->Valid());
|
2014-04-10 21:19:43 +00:00
|
|
|
|
2017-02-07 00:29:29 +00:00
|
|
|
ASSERT_EQ(item.first, index_iter->key().ToString());
|
|
|
|
ASSERT_EQ(item.second, index_iter->value().ToString());
|
2014-04-10 21:19:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < prefixes.size(); ++i) {
|
|
|
|
// the key is greater than any existing keys.
|
|
|
|
auto key = prefixes[i] + "9";
|
2017-02-07 00:29:29 +00:00
|
|
|
index_iter->Seek(InternalKey(key, 0, kTypeValue).Encode());
|
2014-04-10 21:19:43 +00:00
|
|
|
|
2020-01-15 22:03:18 +00:00
|
|
|
ASSERT_TRUE(index_iter->status().ok() || index_iter->status().IsNotFound());
|
|
|
|
ASSERT_TRUE(!index_iter->status().IsNotFound() || !index_iter->Valid());
|
2014-04-10 21:19:43 +00:00
|
|
|
if (i == prefixes.size() - 1) {
|
|
|
|
// last key
|
2017-02-07 00:29:29 +00:00
|
|
|
ASSERT_TRUE(!index_iter->Valid());
|
2014-04-10 21:19:43 +00:00
|
|
|
} else {
|
2017-02-07 00:29:29 +00:00
|
|
|
ASSERT_TRUE(index_iter->Valid());
|
2014-04-10 21:19:43 +00:00
|
|
|
// seek the first element in the block
|
2017-02-07 00:29:29 +00:00
|
|
|
ASSERT_EQ(upper_bound[i], index_iter->key().ToString());
|
|
|
|
ASSERT_EQ("v", index_iter->value().ToString());
|
2014-04-10 21:19:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// find keys with prefix that don't match any of the existing prefixes.
|
|
|
|
std::vector<std::string> non_exist_prefixes = {"002", "004", "006", "008"};
|
|
|
|
for (const auto& prefix : non_exist_prefixes) {
|
2017-02-07 00:29:29 +00:00
|
|
|
index_iter->Seek(InternalKey(prefix, 0, kTypeValue).Encode());
|
2014-04-10 21:19:43 +00:00
|
|
|
// regular_iter->Seek(prefix);
|
|
|
|
|
2017-02-07 00:29:29 +00:00
|
|
|
ASSERT_OK(index_iter->status());
|
2014-06-13 02:03:22 +00:00
|
|
|
// Seek to non-existing prefixes should yield either invalid, or a
|
|
|
|
// key with prefix greater than the target.
|
2017-02-07 00:29:29 +00:00
|
|
|
if (index_iter->Valid()) {
|
|
|
|
Slice ukey = ExtractUserKey(index_iter->key());
|
2014-06-13 02:03:22 +00:00
|
|
|
Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
|
|
|
|
ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) < 0);
|
|
|
|
}
|
2014-04-10 21:19:43 +00:00
|
|
|
}
|
2020-01-15 22:03:18 +00:00
|
|
|
for (const auto& prefix : non_exist_prefixes) {
|
|
|
|
index_iter->SeekForPrev(InternalKey(prefix, 0, kTypeValue).Encode());
|
|
|
|
// regular_iter->Seek(prefix);
|
|
|
|
|
|
|
|
ASSERT_OK(index_iter->status());
|
|
|
|
// Seek to non-existing prefixes should yield either invalid, or a
|
|
|
|
// key with prefix greater than the target.
|
|
|
|
if (index_iter->Valid()) {
|
|
|
|
Slice ukey = ExtractUserKey(index_iter->key());
|
|
|
|
Slice ukey_prefix = options.prefix_extractor->Transform(ukey);
|
|
|
|
ASSERT_TRUE(BytewiseComparator()->Compare(prefix, ukey_prefix) > 0);
|
|
|
|
}
|
|
|
|
}
|
2020-03-17 19:28:46 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
// Test reseek case. It should impact partitioned index more.
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
std::unique_ptr<InternalIterator> index_iter2(reader->NewIterator(
|
|
|
|
ro, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
|
|
|
|
|
|
|
// Things to cover in partitioned index:
|
|
|
|
// 1. Both of Seek() and SeekToLast() has optimization to prevent
|
|
|
|
// rereek leaf index block if it remains to the same one, and
|
|
|
|
// they reuse the same variable.
|
|
|
|
// 2. When Next() or Prev() is called, the block moves, so the
|
|
|
|
// optimization should kick in only with the current one.
|
|
|
|
index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->SeekToLast();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->Seek(InternalKey("0055", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0055", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->SeekToLast();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
index_iter2->Prev();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
index_iter2->Prev();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
index_iter2->Prev();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
index_iter2->Prev();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->SeekToLast();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->Seek(InternalKey("0095", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->Prev();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
index_iter2->Prev();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->Seek(InternalKey("0075", 0, kTypeValue).Encode());
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0075", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->Next();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
index_iter2->Next();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
|
|
|
|
index_iter2->SeekToLast();
|
|
|
|
ASSERT_TRUE(index_iter2->Valid());
|
|
|
|
ASSERT_EQ("0095", index_iter2->key().ToString().substr(0, 4));
|
|
|
|
}
|
|
|
|
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2014-04-10 21:19:43 +00:00
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BinaryIndexTest) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2017-02-07 00:29:29 +00:00
|
|
|
table_options.index_type = BlockBasedTableOptions::kBinarySearch;
|
|
|
|
IndexTest(table_options);
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, HashIndexTest) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2017-02-07 00:29:29 +00:00
|
|
|
table_options.index_type = BlockBasedTableOptions::kHashSearch;
|
|
|
|
IndexTest(table_options);
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, PartitionIndexTest) {
|
2017-02-07 00:29:29 +00:00
|
|
|
const int max_index_keys = 5;
|
2017-03-28 18:56:56 +00:00
|
|
|
const int est_max_index_key_value_size = 32;
|
|
|
|
const int est_max_index_size = max_index_keys * est_max_index_key_value_size;
|
|
|
|
for (int i = 1; i <= est_max_index_size + 1; i++) {
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2017-02-07 00:29:29 +00:00
|
|
|
table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
|
2017-03-28 18:56:56 +00:00
|
|
|
table_options.metadata_block_size = i;
|
2017-02-07 00:29:29 +00:00
|
|
|
IndexTest(table_options);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-10 19:36:40 +00:00
|
|
|
TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) {
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
Options options;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2019-05-10 19:36:40 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
AddInternalKey(&c, "pika");
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
|
|
|
|
&kvmap);
|
|
|
|
ASSERT_EQ(1, keys.size());
|
|
|
|
|
|
|
|
auto reader = c.GetTableReader();
|
|
|
|
ReadOptions ropt;
|
|
|
|
ropt.read_tier = ReadTier::kBlockCacheTier;
|
2019-06-20 21:28:22 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(reader->NewIterator(
|
|
|
|
ropt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
2019-05-10 19:36:40 +00:00
|
|
|
|
|
|
|
auto ikey = [](Slice user_key) {
|
|
|
|
return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
|
|
|
|
};
|
|
|
|
|
|
|
|
iter->Seek(ikey("pika"));
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_TRUE(iter->status().IsIncomplete());
|
|
|
|
|
|
|
|
// This used to crash at some point.
|
|
|
|
iter->Seek(ikey("pika"));
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_TRUE(iter->status().IsIncomplete());
|
|
|
|
}
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey1) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey;
|
|
|
|
IndexTest(table_options);
|
|
|
|
}
|
|
|
|
|
|
|
|
class CustomFlushBlockPolicy : public FlushBlockPolicyFactory,
|
|
|
|
public FlushBlockPolicy {
|
|
|
|
public:
|
|
|
|
explicit CustomFlushBlockPolicy(std::vector<int> keys_per_block)
|
|
|
|
: keys_per_block_(keys_per_block) {}
|
|
|
|
|
2020-10-01 22:21:17 +00:00
|
|
|
const char* Name() const override { return "CustomFlushBlockPolicy"; }
|
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&,
|
|
|
|
const BlockBuilder&) const override {
|
|
|
|
return new CustomFlushBlockPolicy(keys_per_block_);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Update(const Slice&, const Slice&) override {
|
|
|
|
if (keys_in_current_block_ >= keys_per_block_.at(current_block_idx_)) {
|
|
|
|
++current_block_idx_;
|
|
|
|
keys_in_current_block_ = 1;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
++keys_in_current_block_;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<int> keys_per_block_;
|
|
|
|
|
|
|
|
int current_block_idx_ = 0;
|
|
|
|
int keys_in_current_block_ = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) {
|
|
|
|
for (int use_first_key = 0; use_first_key < 2; ++use_first_key) {
|
|
|
|
SCOPED_TRACE("use_first_key = " + std::to_string(use_first_key));
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.index_type =
|
|
|
|
use_first_key ? BlockBasedTableOptions::kBinarySearchWithFirstKey
|
|
|
|
: BlockBasedTableOptions::kBinarySearch;
|
|
|
|
table_options.block_cache = NewLRUCache(10000); // fits all blocks
|
|
|
|
table_options.index_shortening =
|
|
|
|
BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<CustomFlushBlockPolicy>(std::vector<int>{2, 1, 3, 2});
|
|
|
|
Options options;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
Statistics* stats = options.statistics.get();
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
|
|
|
|
// Block 0.
|
|
|
|
AddInternalKey(&c, "aaaa", "v0");
|
|
|
|
AddInternalKey(&c, "aaac", "v1");
|
|
|
|
|
|
|
|
// Block 1.
|
|
|
|
AddInternalKey(&c, "aaca", "v2");
|
|
|
|
|
|
|
|
// Block 2.
|
|
|
|
AddInternalKey(&c, "caaa", "v3");
|
|
|
|
AddInternalKey(&c, "caac", "v4");
|
|
|
|
AddInternalKey(&c, "caae", "v5");
|
|
|
|
|
|
|
|
// Block 3.
|
|
|
|
AddInternalKey(&c, "ccaa", "v6");
|
|
|
|
AddInternalKey(&c, "ccac", "v7");
|
|
|
|
|
|
|
|
// Write the file.
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
|
|
|
|
&kvmap);
|
|
|
|
ASSERT_EQ(8, keys.size());
|
|
|
|
|
|
|
|
auto reader = c.GetTableReader();
|
|
|
|
auto props = reader->GetTableProperties();
|
|
|
|
ASSERT_EQ(4u, props->num_data_blocks);
|
2020-08-03 22:21:56 +00:00
|
|
|
ReadOptions read_options;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(reader->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized,
|
|
|
|
/*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
|
|
|
|
// Shouldn't have read data blocks before iterator is seeked.
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
auto ikey = [](Slice user_key) {
|
|
|
|
return InternalKey(user_key, 0, kTypeValue).Encode().ToString();
|
|
|
|
};
|
|
|
|
|
|
|
|
// Seek to a key between blocks. If index contains first key, we shouldn't
|
|
|
|
// read any data blocks until value is requested.
|
|
|
|
iter->Seek(ikey("aaba"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[2], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 0 : 1,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("v2", iter->value().ToString());
|
|
|
|
EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to the middle of a block. The block should be read right away.
|
|
|
|
iter->Seek(ikey("caab"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[4], iter->key().ToString());
|
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("v4", iter->value().ToString());
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to just before the same block and don't access value.
|
|
|
|
// The iterator should keep pinning the block contents.
|
|
|
|
iter->Seek(ikey("baaa"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[3], iter->key().ToString());
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to the same block again to check that the block is still pinned.
|
|
|
|
iter->Seek(ikey("caae"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[5], iter->key().ToString());
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("v5", iter->value().ToString());
|
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Step forward and fall through to the next block. Don't access value.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[6], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 3,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Step forward again. Block should be read.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[7], iter->key().ToString());
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("v7", iter->value().ToString());
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Step forward and reach the end.
|
|
|
|
iter->Next();
|
|
|
|
EXPECT_FALSE(iter->Valid());
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to a single-key block and step forward without accessing value.
|
|
|
|
iter->Seek(ikey("aaca"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[2], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 0 : 1,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[3], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 1 : 2,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("v3", iter->value().ToString());
|
|
|
|
EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
|
|
|
// Seek between blocks and step back without accessing value.
|
|
|
|
iter->Seek(ikey("aaca"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[2], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 3,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[1], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 3,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
// All blocks are in cache now, there'll be no more misses ever.
|
|
|
|
EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("v1", iter->value().ToString());
|
|
|
|
|
|
|
|
// Next into the next block again.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[2], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 4,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Seek to first and step back without accessing value.
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[0], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 5,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
EXPECT_FALSE(iter->Valid());
|
|
|
|
EXPECT_EQ(use_first_key ? 2 : 5,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
// Do some SeekForPrev() and SeekToLast() just to cover all methods.
|
|
|
|
iter->SeekForPrev(ikey("caad"));
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[4], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 3 : 6,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("v4", iter->value().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 3 : 6,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
iter->SeekToLast();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(keys[7], iter->key().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 4 : 7,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("v7", iter->value().ToString());
|
|
|
|
EXPECT_EQ(use_first_key ? 4 : 7,
|
|
|
|
stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
|
|
|
|
EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) {
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kBinarySearchWithFirstKey;
|
|
|
|
table_options.block_cache = NewLRUCache(10000);
|
|
|
|
Options options;
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
Statistics* stats = options.statistics.get();
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator(), /* convert_to_internal_key */ false,
|
|
|
|
/* level */ -1, /* largest_seqno */ 42);
|
|
|
|
|
|
|
|
c.Add(InternalKey("b", 0, kTypeValue).Encode().ToString(), "x");
|
|
|
|
c.Add(InternalKey("c", 0, kTypeValue).Encode().ToString(), "y");
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
|
|
|
|
&kvmap);
|
|
|
|
ASSERT_EQ(2, keys.size());
|
|
|
|
|
|
|
|
auto reader = c.GetTableReader();
|
|
|
|
auto props = reader->GetTableProperties();
|
|
|
|
ASSERT_EQ(1u, props->num_data_blocks);
|
2020-08-03 22:21:56 +00:00
|
|
|
ReadOptions read_options;
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(reader->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized,
|
|
|
|
/*compaction_readahead_size=*/0, /*allow_unprepared_value=*/true));
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
|
|
|
|
iter->Seek(InternalKey("a", 0, kTypeValue).Encode().ToString());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(),
|
|
|
|
iter->key().ToString());
|
|
|
|
EXPECT_NE(keys[0], iter->key().ToString());
|
|
|
|
// Key should have been served from index, without reading data blocks.
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
2020-04-16 00:37:23 +00:00
|
|
|
ASSERT_TRUE(iter->PrepareValue());
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
EXPECT_EQ("x", iter->value().ToString());
|
|
|
|
EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
|
|
|
|
EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
|
|
|
|
EXPECT_EQ(InternalKey("b", 42, kTypeValue).Encode().ToString(),
|
|
|
|
iter->key().ToString());
|
|
|
|
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
2013-10-10 18:43:24 +00:00
|
|
|
// It's very hard to figure out the index block size of a block accurately.
|
|
|
|
// To make sure we get the index size, we just make sure as key number
|
|
|
|
// grows, the filter block size also grows.
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, IndexSizeStat) {
|
2013-10-10 18:43:24 +00:00
|
|
|
uint64_t last_index_size = 0;
|
|
|
|
|
|
|
|
// we need to use random keys since the pure human readable texts
|
|
|
|
// may be well compressed, resulting insignifcant change of index
|
|
|
|
// block size.
|
|
|
|
Random rnd(test::RandomSeed());
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
2020-07-09 21:33:42 +00:00
|
|
|
keys.push_back(rnd.RandomString(10000));
|
2013-10-10 18:43:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Each time we load one more key to the table. the table index block
|
|
|
|
// size is expected to be larger than last time's.
|
|
|
|
for (size_t i = 1; i < keys.size(); ++i) {
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(),
|
|
|
|
true /* convert_to_internal_key_ */);
|
2013-10-10 18:43:24 +00:00
|
|
|
for (size_t j = 0; j < i; ++j) {
|
|
|
|
c.Add(keys[j], "val");
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> ks;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2013-11-20 06:00:48 +00:00
|
|
|
Options options;
|
2013-10-10 18:43:24 +00:00
|
|
|
options.compression = kNoCompression;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options.block_restart_interval = 1;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2013-10-10 18:43:24 +00:00
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2014-08-25 21:22:05 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &ks, &kvmap);
|
2014-08-25 23:14:30 +00:00
|
|
|
auto index_size = c.GetTableReader()->GetTableProperties()->index_size;
|
2013-10-10 18:43:24 +00:00
|
|
|
ASSERT_GT(index_size, last_index_size);
|
|
|
|
last_index_size = index_size;
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2013-10-10 18:43:24 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, NumBlockStat) {
|
2013-10-10 18:43:24 +00:00
|
|
|
Random rnd(test::RandomSeed());
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2013-10-10 18:43:24 +00:00
|
|
|
Options options;
|
|
|
|
options.compression = kNoCompression;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options.block_restart_interval = 1;
|
|
|
|
table_options.block_size = 1000;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2013-10-10 18:43:24 +00:00
|
|
|
|
|
|
|
for (int i = 0; i < 10; ++i) {
|
|
|
|
// the key/val are slightly smaller than block size, so that each block
|
|
|
|
// holds roughly one key/value pair.
|
2020-07-09 21:33:42 +00:00
|
|
|
c.Add(rnd.RandomString(900), "val");
|
2013-10-10 18:43:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> ks;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2014-08-25 21:22:05 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &ks, &kvmap);
|
2014-02-05 00:21:47 +00:00
|
|
|
ASSERT_EQ(kvmap.size(),
|
2014-08-25 23:14:30 +00:00
|
|
|
c.GetTableReader()->GetTableProperties()->num_data_blocks);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2013-10-10 18:43:24 +00:00
|
|
|
}
|
|
|
|
|
2019-07-17 20:02:00 +00:00
|
|
|
TEST_P(BlockBasedTableTest, TracingGetTest) {
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
table_options.block_cache = NewLRUCache(1024 * 1024, 0);
|
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
Hide deprecated, inefficient block-based filter from public API (#9535)
Summary:
This change removes the ability to configure the deprecated,
inefficient block-based filter in the public API. Options that would
have enabled it now use "full" (and optionally partitioned) filters.
Existing block-based filters can still be read and used, and a "back
door" way to build them still exists, for testing and in case of trouble.
About the only way this removal would cause an issue for users is if
temporary memory for filter construction greatly increases. In
HISTORY.md we suggest a few possible mitigations: partitioned filters,
smaller SST files, or setting reserve_table_builder_memory=true.
Or users who have customized a FilterPolicy using the
CreateFilter/KeyMayMatch mechanism removed in https://github.com/facebook/rocksdb/issues/9501 will have to upgrade
their code. (It's long past time for people to move to the new
builder/reader customization interface.)
This change also introduces some internal-use-only configuration strings
for testing specific filter implementations while bypassing some
compatibility / intelligence logic. This is intended to hint at a path
toward making FilterPolicy Customizable, but it also gives us a "back
door" way to configure block-based filter.
Aside: updated db_bench so that -readonly implies -use_existing_db
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9535
Test Plan:
Unit tests updated. Specifically,
* BlockBasedTableTest.BlockReadCountTest is tweaked to validate the back
door configuration interface and ignoring of `use_block_based_builder`.
* BlockBasedTableTest.TracingGetTest is migrated from testing
block-based filter access pattern to full filter access patter, by
re-ordering some things.
* Options test (pretty self-explanatory)
Performance test - create with `./db_bench -db=/dev/shm/rocksdb1 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0` with and without `-use_block_based_filter`, which creates a DB with 21 SST files in L0. Read with `./db_bench -db=/dev/shm/rocksdb1 -readonly -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=30`
Without -use_block_based_filter: readrandom 464 ops/sec, 689280 KB DB
With -use_block_based_filter: readrandom 169 ops/sec, 690996 KB DB
No consistent difference with fillrandom
Reviewed By: jay-zhuang
Differential Revision: D34153871
Pulled By: pdillinger
fbshipit-source-id: 31f4a933c542f8f09aca47fa64aec67832a69738
2022-02-12 15:04:09 +00:00
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
|
2019-07-17 20:02:00 +00:00
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
SetupTracingTest(&c);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2019-07-17 20:02:00 +00:00
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
InternalKey internal_key(auto_add_key1, 0, kTypeValue);
|
2019-07-17 20:02:00 +00:00
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
|
|
|
for (uint32_t i = 1; i <= 2; i++) {
|
|
|
|
PinnableSlice value;
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
GetContext get_context(
|
|
|
|
options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound,
|
|
|
|
auto_add_key1, &value, nullptr, nullptr, nullptr, true, nullptr,
|
|
|
|
nullptr, nullptr, nullptr, nullptr, nullptr, /*tracing_get_id=*/i);
|
2019-07-17 20:02:00 +00:00
|
|
|
get_perf_context()->Reset();
|
|
|
|
ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
|
|
|
ASSERT_EQ(get_context.State(), GetContext::kFound);
|
|
|
|
ASSERT_EQ(value.ToString(), kDummyValue);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Verify traces.
|
|
|
|
std::vector<BlockCacheTraceRecord> expected_records;
|
|
|
|
// The first two records should be prefetching index and filter blocks.
|
|
|
|
BlockCacheTraceRecord record;
|
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
|
|
record.caller = TableReaderCaller::kPrefetch;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = false;
|
|
|
|
record.no_insert = false;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
|
|
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
// Then we should have three records for one index, one filter, and one data
|
|
|
|
// block access.
|
|
|
|
record.get_id = 1;
|
Hide deprecated, inefficient block-based filter from public API (#9535)
Summary:
This change removes the ability to configure the deprecated,
inefficient block-based filter in the public API. Options that would
have enabled it now use "full" (and optionally partitioned) filters.
Existing block-based filters can still be read and used, and a "back
door" way to build them still exists, for testing and in case of trouble.
About the only way this removal would cause an issue for users is if
temporary memory for filter construction greatly increases. In
HISTORY.md we suggest a few possible mitigations: partitioned filters,
smaller SST files, or setting reserve_table_builder_memory=true.
Or users who have customized a FilterPolicy using the
CreateFilter/KeyMayMatch mechanism removed in https://github.com/facebook/rocksdb/issues/9501 will have to upgrade
their code. (It's long past time for people to move to the new
builder/reader customization interface.)
This change also introduces some internal-use-only configuration strings
for testing specific filter implementations while bypassing some
compatibility / intelligence logic. This is intended to hint at a path
toward making FilterPolicy Customizable, but it also gives us a "back
door" way to configure block-based filter.
Aside: updated db_bench so that -readonly implies -use_existing_db
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9535
Test Plan:
Unit tests updated. Specifically,
* BlockBasedTableTest.BlockReadCountTest is tweaked to validate the back
door configuration interface and ignoring of `use_block_based_builder`.
* BlockBasedTableTest.TracingGetTest is migrated from testing
block-based filter access pattern to full filter access patter, by
re-ordering some things.
* Options test (pretty self-explanatory)
Performance test - create with `./db_bench -db=/dev/shm/rocksdb1 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0` with and without `-use_block_based_filter`, which creates a DB with 21 SST files in L0. Read with `./db_bench -db=/dev/shm/rocksdb1 -readonly -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=30`
Without -use_block_based_filter: readrandom 464 ops/sec, 689280 KB DB
With -use_block_based_filter: readrandom 169 ops/sec, 690996 KB DB
No consistent difference with fillrandom
Reviewed By: jay-zhuang
Differential Revision: D34153871
Pulled By: pdillinger
fbshipit-source-id: 31f4a933c542f8f09aca47fa64aec67832a69738
2022-02-12 15:04:09 +00:00
|
|
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
2019-07-17 20:02:00 +00:00
|
|
|
record.caller = TableReaderCaller::kUserGet;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.get_from_user_specified_snapshot = false;
|
2019-07-17 20:02:00 +00:00
|
|
|
record.referenced_key = encoded_key;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.referenced_key_exist_in_block = true;
|
|
|
|
record.is_cache_hit = true;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
Hide deprecated, inefficient block-based filter from public API (#9535)
Summary:
This change removes the ability to configure the deprecated,
inefficient block-based filter in the public API. Options that would
have enabled it now use "full" (and optionally partitioned) filters.
Existing block-based filters can still be read and used, and a "back
door" way to build them still exists, for testing and in case of trouble.
About the only way this removal would cause an issue for users is if
temporary memory for filter construction greatly increases. In
HISTORY.md we suggest a few possible mitigations: partitioned filters,
smaller SST files, or setting reserve_table_builder_memory=true.
Or users who have customized a FilterPolicy using the
CreateFilter/KeyMayMatch mechanism removed in https://github.com/facebook/rocksdb/issues/9501 will have to upgrade
their code. (It's long past time for people to move to the new
builder/reader customization interface.)
This change also introduces some internal-use-only configuration strings
for testing specific filter implementations while bypassing some
compatibility / intelligence logic. This is intended to hint at a path
toward making FilterPolicy Customizable, but it also gives us a "back
door" way to configure block-based filter.
Aside: updated db_bench so that -readonly implies -use_existing_db
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9535
Test Plan:
Unit tests updated. Specifically,
* BlockBasedTableTest.BlockReadCountTest is tweaked to validate the back
door configuration interface and ignoring of `use_block_based_builder`.
* BlockBasedTableTest.TracingGetTest is migrated from testing
block-based filter access pattern to full filter access patter, by
re-ordering some things.
* Options test (pretty self-explanatory)
Performance test - create with `./db_bench -db=/dev/shm/rocksdb1 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0` with and without `-use_block_based_filter`, which creates a DB with 21 SST files in L0. Read with `./db_bench -db=/dev/shm/rocksdb1 -readonly -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=30`
Without -use_block_based_filter: readrandom 464 ops/sec, 689280 KB DB
With -use_block_based_filter: readrandom 169 ops/sec, 690996 KB DB
No consistent difference with fillrandom
Reviewed By: jay-zhuang
Differential Revision: D34153871
Pulled By: pdillinger
fbshipit-source-id: 31f4a933c542f8f09aca47fa64aec67832a69738
2022-02-12 15:04:09 +00:00
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = false;
|
2019-07-17 20:02:00 +00:00
|
|
|
record.block_type = TraceType::kBlockTraceDataBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
// The second get should all observe cache hits.
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = true;
|
2019-07-17 20:02:00 +00:00
|
|
|
record.get_id = 2;
|
Hide deprecated, inefficient block-based filter from public API (#9535)
Summary:
This change removes the ability to configure the deprecated,
inefficient block-based filter in the public API. Options that would
have enabled it now use "full" (and optionally partitioned) filters.
Existing block-based filters can still be read and used, and a "back
door" way to build them still exists, for testing and in case of trouble.
About the only way this removal would cause an issue for users is if
temporary memory for filter construction greatly increases. In
HISTORY.md we suggest a few possible mitigations: partitioned filters,
smaller SST files, or setting reserve_table_builder_memory=true.
Or users who have customized a FilterPolicy using the
CreateFilter/KeyMayMatch mechanism removed in https://github.com/facebook/rocksdb/issues/9501 will have to upgrade
their code. (It's long past time for people to move to the new
builder/reader customization interface.)
This change also introduces some internal-use-only configuration strings
for testing specific filter implementations while bypassing some
compatibility / intelligence logic. This is intended to hint at a path
toward making FilterPolicy Customizable, but it also gives us a "back
door" way to configure block-based filter.
Aside: updated db_bench so that -readonly implies -use_existing_db
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9535
Test Plan:
Unit tests updated. Specifically,
* BlockBasedTableTest.BlockReadCountTest is tweaked to validate the back
door configuration interface and ignoring of `use_block_based_builder`.
* BlockBasedTableTest.TracingGetTest is migrated from testing
block-based filter access pattern to full filter access patter, by
re-ordering some things.
* Options test (pretty self-explanatory)
Performance test - create with `./db_bench -db=/dev/shm/rocksdb1 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0` with and without `-use_block_based_filter`, which creates a DB with 21 SST files in L0. Read with `./db_bench -db=/dev/shm/rocksdb1 -readonly -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=30`
Without -use_block_based_filter: readrandom 464 ops/sec, 689280 KB DB
With -use_block_based_filter: readrandom 169 ops/sec, 690996 KB DB
No consistent difference with fillrandom
Reviewed By: jay-zhuang
Differential Revision: D34153871
Pulled By: pdillinger
fbshipit-source-id: 31f4a933c542f8f09aca47fa64aec67832a69738
2022-02-12 15:04:09 +00:00
|
|
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
2019-07-17 20:02:00 +00:00
|
|
|
record.caller = TableReaderCaller::kUserGet;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.get_from_user_specified_snapshot = false;
|
2019-07-17 20:02:00 +00:00
|
|
|
record.referenced_key = encoded_key;
|
|
|
|
expected_records.push_back(record);
|
Hide deprecated, inefficient block-based filter from public API (#9535)
Summary:
This change removes the ability to configure the deprecated,
inefficient block-based filter in the public API. Options that would
have enabled it now use "full" (and optionally partitioned) filters.
Existing block-based filters can still be read and used, and a "back
door" way to build them still exists, for testing and in case of trouble.
About the only way this removal would cause an issue for users is if
temporary memory for filter construction greatly increases. In
HISTORY.md we suggest a few possible mitigations: partitioned filters,
smaller SST files, or setting reserve_table_builder_memory=true.
Or users who have customized a FilterPolicy using the
CreateFilter/KeyMayMatch mechanism removed in https://github.com/facebook/rocksdb/issues/9501 will have to upgrade
their code. (It's long past time for people to move to the new
builder/reader customization interface.)
This change also introduces some internal-use-only configuration strings
for testing specific filter implementations while bypassing some
compatibility / intelligence logic. This is intended to hint at a path
toward making FilterPolicy Customizable, but it also gives us a "back
door" way to configure block-based filter.
Aside: updated db_bench so that -readonly implies -use_existing_db
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9535
Test Plan:
Unit tests updated. Specifically,
* BlockBasedTableTest.BlockReadCountTest is tweaked to validate the back
door configuration interface and ignoring of `use_block_based_builder`.
* BlockBasedTableTest.TracingGetTest is migrated from testing
block-based filter access pattern to full filter access patter, by
re-ordering some things.
* Options test (pretty self-explanatory)
Performance test - create with `./db_bench -db=/dev/shm/rocksdb1 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0` with and without `-use_block_based_filter`, which creates a DB with 21 SST files in L0. Read with `./db_bench -db=/dev/shm/rocksdb1 -readonly -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=30`
Without -use_block_based_filter: readrandom 464 ops/sec, 689280 KB DB
With -use_block_based_filter: readrandom 169 ops/sec, 690996 KB DB
No consistent difference with fillrandom
Reviewed By: jay-zhuang
Differential Revision: D34153871
Pulled By: pdillinger
fbshipit-source-id: 31f4a933c542f8f09aca47fa64aec67832a69738
2022-02-12 15:04:09 +00:00
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
|
|
|
record.block_type = TraceType::kBlockTraceDataBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
VerifyBlockAccessTrace(&c, expected_records);
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
void GenerateKVMap(TableConstructor* c) {
|
|
|
|
int num_block = 100;
|
|
|
|
Random rnd(101);
|
|
|
|
uint32_t key = 0;
|
|
|
|
for (int block = 0; block < num_block; block++) {
|
|
|
|
for (int i = 0; i < 16; i++) {
|
|
|
|
char k[9] = {0};
|
|
|
|
// Internal key is constructed directly from this key,
|
|
|
|
// and internal key size is required to be >= 8 bytes,
|
|
|
|
// so use %08u as the format string.
|
|
|
|
snprintf(k, sizeof(k), "%08u", key);
|
|
|
|
std::string v = rnd.RandomString(256);
|
|
|
|
InternalKey ikey(std::string(k), 0, kTypeValue);
|
|
|
|
c->Add(ikey.Encode().ToString(), rnd.RandomString(256));
|
|
|
|
key++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void WarmUpCache(TableConstructor* c, const MutableCFOptions& moptions,
|
|
|
|
const std::vector<std::string>& warm_keys) {
|
|
|
|
ReadOptions ro;
|
|
|
|
std::unique_ptr<InternalIterator> iter(c->GetTableReader()->NewIterator(
|
|
|
|
ro, moptions.prefix_extractor.get(), nullptr, false,
|
|
|
|
TableReaderCaller::kUncategorized));
|
|
|
|
size_t i = 0;
|
|
|
|
while (i < warm_keys.size()) {
|
|
|
|
InternalKey ikey(warm_keys[i], 0, kTypeValue);
|
|
|
|
iter->Seek(ikey.Encode().ToString());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, BlockCacheLookupSeqScans) {
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
options.create_if_missing = true;
|
2024-04-27 03:05:30 +00:00
|
|
|
options.compression = kNoCompression;
|
2023-12-14 19:25:51 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2023-12-06 21:48:15 +00:00
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
|
|
|
|
table_options.block_cache = NewLRUCache(1024 * 1024, 0);
|
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
|
|
|
|
table_options.block_align = true;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
2024-04-27 03:05:30 +00:00
|
|
|
ASSERT_OK(options.table_factory->ValidateOptions(
|
|
|
|
DBOptions(options), ColumnFamilyOptions(options)));
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
GenerateKVMap(&c);
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
const InternalKeyComparator internal_comparator(options.comparator);
|
|
|
|
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, internal_comparator,
|
|
|
|
&keys, &kvmap);
|
|
|
|
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
BlockBasedTable* bbt = static_cast<BlockBasedTable*>(c.GetTableReader());
|
2023-12-06 21:48:15 +00:00
|
|
|
BlockHandle block_handle;
|
|
|
|
|
|
|
|
ReadOptions read_options;
|
|
|
|
read_options.auto_readahead_size = true;
|
|
|
|
Slice ub = Slice("00000805");
|
|
|
|
Slice* ub_ptr = &ub;
|
|
|
|
read_options.iterate_upper_bound = ub_ptr;
|
|
|
|
read_options.readahead_size = 16384;
|
|
|
|
|
|
|
|
// Test various functionalities -
|
|
|
|
// 5 blocks prefetched - Current + 4 additional (readahead_size).
|
|
|
|
{
|
|
|
|
// Check the behavior when it's -
|
|
|
|
// Miss(200), Hit(210), Hit(225), Hit(240), Hit(255).
|
|
|
|
// It should only prefetch current block (200).
|
|
|
|
{
|
|
|
|
std::vector<std::string> warm_keys{"00000210", "00000225", "00000240",
|
|
|
|
"00000255"};
|
|
|
|
WarmUpCache(&c, moptions, warm_keys);
|
|
|
|
|
2023-12-14 19:25:51 +00:00
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
|
|
|
|
read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
|
|
|
|
|
|
|
// Seek key -
|
|
|
|
InternalKey ikey("00000200", 0, kTypeValue);
|
|
|
|
auto kv_iter = kvmap.find(ikey.Encode().ToString());
|
|
|
|
|
|
|
|
iter->Seek(kv_iter->first);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
|
|
|
|
FilePrefetchBuffer* prefetch_buffer =
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
(static_cast<BlockBasedTableIterator*>(iter.get()))
|
2023-12-06 21:48:15 +00:00
|
|
|
->prefetch_buffer();
|
2024-01-05 17:29:01 +00:00
|
|
|
std::vector<std::pair<uint64_t, size_t>> buffer_info(1);
|
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle);
|
|
|
|
// It won't prefetch the data of cache hit.
|
|
|
|
// One block data.
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].second, 4096);
|
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
2023-12-14 19:25:51 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED),
|
|
|
|
1);
|
2023-12-06 21:48:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Check the behavior when it's -
|
|
|
|
// First Prefetch - Miss(315), Miss(330), Miss(345), Hit(360), Hit(375),
|
|
|
|
// Second Prefetch - Miss(390), Miss(405) ...
|
|
|
|
// First prefetch should only prefetch from 315 to 345.
|
|
|
|
std::vector<std::string> warm_keys{"00000360", "00000375"};
|
|
|
|
WarmUpCache(&c, moptions, warm_keys);
|
|
|
|
|
|
|
|
std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
|
|
|
|
read_options, moptions.prefix_extractor.get(), nullptr, false,
|
|
|
|
TableReaderCaller::kUncategorized));
|
|
|
|
|
|
|
|
// Seek key -
|
|
|
|
InternalKey ikey("00000315", 0, kTypeValue);
|
|
|
|
auto kv_iter = kvmap.find(ikey.Encode().ToString());
|
|
|
|
|
|
|
|
iter->Seek(kv_iter->first);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
|
|
|
|
FilePrefetchBuffer* prefetch_buffer =
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
(static_cast<BlockBasedTableIterator*>(iter.get()))
|
2023-12-06 21:48:15 +00:00
|
|
|
->prefetch_buffer();
|
2024-01-05 17:29:01 +00:00
|
|
|
std::vector<std::pair<uint64_t, size_t>> buffer_info(1);
|
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
2023-12-06 21:48:15 +00:00
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle);
|
|
|
|
|
|
|
|
// It won't prefetch the data of cache hit.
|
|
|
|
// 3 blocks data.
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].second, 12288);
|
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
for (; kv_iter != kvmap.end() && iter->Valid(); kv_iter++) {
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
|
|
|
|
if (iter->user_key().ToString() == "00000400") {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Second Prefetch.
|
2024-01-05 17:29:01 +00:00
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
2023-12-06 21:48:15 +00:00
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].second, 20480);
|
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
2023-12-14 19:25:51 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED),
|
|
|
|
1);
|
2023-12-06 21:48:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, BlockCacheLookupAsyncScansSeek) {
|
|
|
|
Options options;
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
std::unique_ptr<Env> env(
|
|
|
|
new CompositeEnvWrapper(c.env_, FileSystem::Default()));
|
|
|
|
options.env = env.get();
|
2024-04-27 03:05:30 +00:00
|
|
|
options.compression = kNoCompression;
|
2023-12-14 19:25:51 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2023-12-06 21:48:15 +00:00
|
|
|
c.env_ = env.get();
|
|
|
|
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
|
|
|
|
table_options.block_cache = NewLRUCache(1024 * 1024, 0);
|
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
|
|
|
|
table_options.block_align = true;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
2024-04-27 03:05:30 +00:00
|
|
|
ASSERT_OK(options.table_factory->ValidateOptions(
|
|
|
|
DBOptions(options), ColumnFamilyOptions(options)));
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
GenerateKVMap(&c);
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
const InternalKeyComparator internal_comparator(options.comparator);
|
|
|
|
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, internal_comparator,
|
|
|
|
&keys, &kvmap);
|
|
|
|
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
BlockBasedTable* bbt = static_cast<BlockBasedTable*>(c.GetTableReader());
|
2023-12-06 21:48:15 +00:00
|
|
|
BlockHandle block_handle;
|
|
|
|
|
|
|
|
ReadOptions read_options;
|
|
|
|
read_options.auto_readahead_size = true;
|
|
|
|
Slice ub = Slice("00000805");
|
|
|
|
Slice* ub_ptr = &ub;
|
|
|
|
read_options.iterate_upper_bound = ub_ptr;
|
|
|
|
read_options.readahead_size = 16384;
|
|
|
|
read_options.async_io = true;
|
|
|
|
|
|
|
|
// Test Various functionalities -
|
|
|
|
// 3 blocks prefetched - Current + 2 additional (readahead_size/2).
|
|
|
|
{
|
|
|
|
// Check the behavior when it's -
|
|
|
|
// 1st Prefetch - Miss(200), Hit(210), Hit(225),
|
|
|
|
// 2nd Prefetch - Hit(240), Hit(255)
|
|
|
|
// First Prefetch will be for 200 offset.
|
|
|
|
// Second prefetch will be 0.
|
|
|
|
{
|
|
|
|
std::vector<std::string> warm_keys{"00000210", "00000225", "00000240",
|
|
|
|
"00000255"};
|
|
|
|
WarmUpCache(&c, moptions, warm_keys);
|
|
|
|
|
2023-12-14 19:25:51 +00:00
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
|
|
|
|
read_options, moptions.prefix_extractor.get(), nullptr, false,
|
|
|
|
TableReaderCaller::kUncategorized));
|
|
|
|
|
|
|
|
// Seek key -
|
|
|
|
InternalKey ikey("00000200", 0, kTypeValue);
|
|
|
|
auto kv_iter = kvmap.find(ikey.Encode().ToString());
|
|
|
|
|
|
|
|
iter->Seek(kv_iter->first);
|
|
|
|
ASSERT_TRUE(iter->status().IsTryAgain());
|
|
|
|
iter->Seek(kv_iter->first);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
|
|
|
|
FilePrefetchBuffer* prefetch_buffer =
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
(static_cast<BlockBasedTableIterator*>(iter.get()))
|
2023-12-06 21:48:15 +00:00
|
|
|
->prefetch_buffer();
|
2024-01-05 17:29:01 +00:00
|
|
|
std::vector<std::pair<uint64_t, size_t>> buffer_info(2);
|
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first, block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[0].second, 4096);
|
|
|
|
ASSERT_EQ(buffer_info[1].second, 0);
|
2023-12-14 19:25:51 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED),
|
|
|
|
2);
|
2023-12-06 21:48:15 +00:00
|
|
|
}
|
|
|
|
{
|
|
|
|
// Check the behavior when it's -
|
|
|
|
// First Prefetch - Miss(315), Miss(330), Hit(345),
|
|
|
|
// Second Prefetch - Miss(360), Miss(375), ...
|
|
|
|
// First prefetch should only prefetch from 315 to 330.
|
|
|
|
// Second prefetch should start from 360.
|
|
|
|
std::vector<std::string> warm_keys{"00000345"};
|
|
|
|
WarmUpCache(&c, moptions, warm_keys);
|
|
|
|
|
|
|
|
std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
|
|
|
|
read_options, moptions.prefix_extractor.get(), nullptr, false,
|
|
|
|
TableReaderCaller::kUncategorized));
|
|
|
|
|
|
|
|
// Seek key -
|
|
|
|
InternalKey ikey("00000315", 0, kTypeValue);
|
|
|
|
auto kv_iter = kvmap.find(ikey.Encode().ToString());
|
|
|
|
|
|
|
|
iter->Seek(kv_iter->first);
|
|
|
|
ASSERT_TRUE(iter->status().IsTryAgain());
|
|
|
|
iter->Seek(kv_iter->first);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
|
|
|
|
FilePrefetchBuffer* prefetch_buffer =
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
(static_cast<BlockBasedTableIterator*>(iter.get()))
|
2023-12-06 21:48:15 +00:00
|
|
|
->prefetch_buffer();
|
2024-01-05 17:29:01 +00:00
|
|
|
std::vector<std::pair<uint64_t, size_t>> buffer_info(2);
|
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
2023-12-06 21:48:15 +00:00
|
|
|
{
|
|
|
|
// 1st Buffer Verification.
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first,
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[0].second, 8192);
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
// 2nd Buffer Verification.
|
|
|
|
InternalKey ikey_tmp("00000360", 0, kTypeValue);
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(),
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[1].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[1].second, 8192);
|
2023-12-14 19:25:51 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED),
|
|
|
|
1);
|
2023-12-06 21:48:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Check the behavior when it's -
|
|
|
|
// First Prefetch - Miss(495), Miss(510), Hit(525), prefetch len- 8192
|
|
|
|
// Second Prefetch async - Miss(540), Miss(555), - 8192
|
|
|
|
// Third Prefetch Async - Hit(570), Miss(585), - 4096
|
|
|
|
// 4th Prefetch Async - Hit(600), Miss(615), - 4096
|
|
|
|
// 5th Prefetch Async - Miss(630), Miss(645) - 8192
|
|
|
|
std::vector<std::string> warm_keys{"00000525", "00000570", "00000600"};
|
|
|
|
WarmUpCache(&c, moptions, warm_keys);
|
|
|
|
|
|
|
|
std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
|
|
|
|
read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
|
|
|
|
|
|
|
// Seek key -
|
|
|
|
InternalKey ikey("00000495", 0, kTypeValue);
|
|
|
|
auto kv_iter = kvmap.find(ikey.Encode().ToString());
|
|
|
|
|
|
|
|
// First and Second Prefetch.
|
|
|
|
iter->Seek(kv_iter->first);
|
|
|
|
ASSERT_TRUE(iter->status().IsTryAgain());
|
|
|
|
iter->Seek(kv_iter->first);
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
|
|
|
|
FilePrefetchBuffer* prefetch_buffer =
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
(static_cast<BlockBasedTableIterator*>(iter.get()))
|
2023-12-06 21:48:15 +00:00
|
|
|
->prefetch_buffer();
|
2024-01-05 17:29:01 +00:00
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
{
|
2024-01-05 17:29:01 +00:00
|
|
|
std::vector<std::pair<uint64_t, size_t>> buffer_info(2);
|
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
// 1st Buffer Verification.
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first,
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[0].second, 8192);
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
// 2nd Buffer Verification.
|
|
|
|
InternalKey ikey_tmp("00000540", 0, kTypeValue);
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(),
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[1].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[1].second, 8192);
|
2023-12-14 19:25:51 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED),
|
|
|
|
1);
|
2023-12-06 21:48:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Third prefetch ReadAsync (buffers will swap).
|
|
|
|
for (; kv_iter != kvmap.end() && iter->Valid(); kv_iter++) {
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
|
|
|
|
if (iter->user_key() == "00000540") {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
2024-01-05 17:29:01 +00:00
|
|
|
std::vector<std::pair<uint64_t, size_t>> buffer_info(2);
|
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
// 1st Buffer Verification.
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first,
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[0].second, 8192);
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
// 2nd Buffer Verification.
|
|
|
|
InternalKey ikey_tmp("00000585", 0, kTypeValue);
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(),
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[1].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[1].second, 4096);
|
2023-12-14 19:25:51 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED),
|
|
|
|
1);
|
2023-12-06 21:48:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// 4th Prefetch ReadAsync (buffers will swap).
|
|
|
|
for (; kv_iter != kvmap.end() && iter->Valid(); kv_iter++) {
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
|
|
|
|
if (iter->user_key() == "00000585") {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
2024-01-05 17:29:01 +00:00
|
|
|
std::vector<std::pair<uint64_t, size_t>> buffer_info(2);
|
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
// 1st Buffer Verification.
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first,
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[0].second, 4096);
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
// 2nd Buffer Verification.
|
|
|
|
InternalKey ikey_tmp("00000615", 0, kTypeValue);
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(),
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[1].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[1].second, 4096);
|
2023-12-14 19:25:51 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED),
|
|
|
|
1);
|
2023-12-06 21:48:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// 5th Prefetch ReadAsync.
|
|
|
|
for (; kv_iter != kvmap.end() && iter->Valid(); kv_iter++) {
|
|
|
|
ASSERT_EQ(iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), kv_iter->second);
|
|
|
|
|
|
|
|
if (iter->user_key() == "00000615") {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
2024-01-05 17:29:01 +00:00
|
|
|
std::vector<std::pair<uint64_t, size_t>> buffer_info(2);
|
|
|
|
prefetch_buffer->TEST_GetBufferOffsetandSize(buffer_info);
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
// 1st Buffer Verification.
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, kv_iter->first,
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[0].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[0].second, 4096);
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
// 2nd Buffer Verification.
|
|
|
|
InternalKey ikey_tmp("00000630", 0, kTypeValue);
|
|
|
|
bbt->TEST_GetDataBlockHandle(read_options, ikey_tmp.Encode().ToString(),
|
|
|
|
block_handle);
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buffer_info[1].first, block_handle.offset());
|
|
|
|
ASSERT_EQ(buffer_info[1].second, 8192);
|
2023-12-14 19:25:51 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED),
|
|
|
|
0);
|
2023-12-06 21:48:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
struct HitMissCountingCache : public CacheWrapper {
|
|
|
|
using CacheWrapper::CacheWrapper;
|
|
|
|
const char* Name() const override { return "HitMissCountingCache"; }
|
|
|
|
|
|
|
|
uint64_t hit_count_ = 0;
|
|
|
|
uint64_t miss_count_ = 0;
|
|
|
|
|
|
|
|
void Reset() {
|
|
|
|
hit_count_ = 0;
|
|
|
|
miss_count_ = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
|
|
|
|
CreateContext* create_context,
|
|
|
|
Priority priority = Priority::LOW,
|
|
|
|
Statistics* stats = nullptr) override {
|
|
|
|
// ASSUMES no blocking async lookups
|
|
|
|
Handle* h = target_->Lookup(key, helper, create_context, priority, stats);
|
|
|
|
if (h) {
|
|
|
|
hit_count_++;
|
|
|
|
} else {
|
|
|
|
miss_count_++;
|
|
|
|
}
|
|
|
|
return h;
|
|
|
|
}
|
|
|
|
|
|
|
|
void StartAsyncLookup(AsyncLookupHandle& async_handle) override {
|
|
|
|
target_->StartAsyncLookup(async_handle);
|
|
|
|
// If not pending, caller might not call WaitAll, so have to account here.
|
|
|
|
if (!async_handle.IsPending()) {
|
|
|
|
if (async_handle.Result()) {
|
|
|
|
hit_count_++;
|
|
|
|
} else {
|
|
|
|
miss_count_++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void WaitAll(AsyncLookupHandle* async_handles, size_t count) override {
|
|
|
|
// If !pending, then we already accounted for it in StartAsyncLookup.
|
|
|
|
// Assume the pending status does not change asynchronously (since
|
|
|
|
// StartAsyncLookup) and remember which still need accounting.
|
|
|
|
std::vector<AsyncLookupHandle*> needs_accounting;
|
|
|
|
for (size_t i = 0; i < count; ++i) {
|
|
|
|
if (async_handles[i].IsPending()) {
|
|
|
|
needs_accounting.push_back(async_handles + i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
target_->WaitAll(async_handles, count);
|
|
|
|
for (auto ah : needs_accounting) {
|
|
|
|
if (ah->Result()) {
|
|
|
|
hit_count_++;
|
|
|
|
} else {
|
|
|
|
miss_count_++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyExpectedHitMissCounts(
|
|
|
|
const std::vector<BlockCacheTraceRecord>& expected_records) {
|
|
|
|
uint64_t expected_hits = 0;
|
|
|
|
uint64_t expected_misses = 0;
|
|
|
|
for (const auto& r : expected_records) {
|
|
|
|
if (r.is_cache_hit) {
|
|
|
|
expected_hits++;
|
|
|
|
} else {
|
|
|
|
expected_misses++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPECT_EQ(expected_hits, hit_count_);
|
|
|
|
EXPECT_EQ(expected_misses, miss_count_);
|
|
|
|
Reset();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, TracingMultiGetTest) {
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
auto cache =
|
|
|
|
std::make_shared<HitMissCountingCache>(NewLRUCache(1024 * 1024, 0));
|
|
|
|
table_options.block_cache = cache;
|
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
|
|
|
|
// Put auto_add_key1 and auto_add_key2 in the same data block
|
|
|
|
table_options.block_size = kDummyValue.size() * 2 + 100;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
SetupTracingTest(&c);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
|
|
|
|
std::vector<BlockCacheTraceRecord> expected_records;
|
|
|
|
|
|
|
|
for (bool first_pass : {true, false}) {
|
|
|
|
uint64_t get_id_offset = first_pass ? 2 : 5;
|
|
|
|
ReadOptions ro;
|
|
|
|
std::array<Slice, 2> ukeys{{auto_add_key1, auto_add_key2}};
|
|
|
|
std::array<PinnableSlice, 2> values;
|
|
|
|
std::vector<GetContext> get_contexts;
|
|
|
|
get_contexts.emplace_back(
|
|
|
|
options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound,
|
2023-12-01 19:15:17 +00:00
|
|
|
ukeys[0], values.data(), nullptr, nullptr, nullptr, true, nullptr,
|
|
|
|
nullptr, nullptr, nullptr, nullptr, nullptr, get_id_offset);
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
get_contexts.emplace_back(
|
|
|
|
options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound,
|
|
|
|
ukeys[1], &values[1], nullptr, nullptr, nullptr, true, nullptr, nullptr,
|
|
|
|
nullptr, nullptr, nullptr, nullptr, get_id_offset + 1);
|
|
|
|
std::array<std::string, 2> encoded_keys;
|
|
|
|
encoded_keys[0] = InternalKey(ukeys[0], 0, kTypeValue).Encode().ToString();
|
|
|
|
encoded_keys[1] = InternalKey(ukeys[1], 0, kTypeValue).Encode().ToString();
|
|
|
|
std::array<Status, 2> statuses;
|
|
|
|
autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
|
|
|
|
key_context.emplace_back(/*ColumnFamilyHandle omitted*/ nullptr, ukeys[0],
|
2023-12-01 19:15:17 +00:00
|
|
|
values.data(),
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
/*PinnableWideColumns omitted*/ nullptr,
|
2023-12-01 19:15:17 +00:00
|
|
|
/*timestamp omitted*/ nullptr, statuses.data());
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
key_context[0].ukey_without_ts = ukeys[0];
|
|
|
|
key_context[0].ikey = encoded_keys[0];
|
2023-12-01 19:15:17 +00:00
|
|
|
key_context[0].get_context = get_contexts.data();
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
key_context.emplace_back(/*ColumnFamilyHandle omitted*/ nullptr, ukeys[1],
|
|
|
|
&values[1],
|
|
|
|
/*PinnableWideColumns omitted*/ nullptr,
|
|
|
|
/*timestamp omitted*/ nullptr, &statuses[1]);
|
|
|
|
key_context[1].ukey_without_ts = ukeys[1];
|
|
|
|
key_context[1].ikey = encoded_keys[1];
|
|
|
|
key_context[1].get_context = &get_contexts[1];
|
|
|
|
autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
|
|
|
|
sorted_keys.push_back(&key_context[0]);
|
|
|
|
sorted_keys.push_back(&key_context[1]);
|
|
|
|
MultiGetContext m_context(
|
|
|
|
&sorted_keys, 0, sorted_keys.size(), /*SequenceNumber*/ 42, ro,
|
|
|
|
options.env->GetFileSystem().get(), options.statistics.get());
|
|
|
|
MultiGetRange range = m_context.GetMultiGetRange();
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
c.GetTableReader()->MultiGet(ro, &range, /*prefix_extractor*/ nullptr);
|
|
|
|
|
|
|
|
// Verify read op result
|
|
|
|
for (uint32_t i = 0; i <= 1; i++) {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
ASSERT_EQ(get_contexts[i].State(), GetContext::kFound);
|
|
|
|
ASSERT_EQ(values[i].ToString(), kDummyValue);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Verify traces.
|
|
|
|
BlockCacheTraceRecord record;
|
|
|
|
if (first_pass) {
|
|
|
|
// The first two records should be prefetching index and filter blocks.
|
|
|
|
record.get_id = 0;
|
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
|
|
record.caller = TableReaderCaller::kPrefetch;
|
|
|
|
record.is_cache_hit = false;
|
|
|
|
record.no_insert = false;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
}
|
2023-12-06 21:48:15 +00:00
|
|
|
// Then we should have three records for one index, one filter, and one
|
|
|
|
// data block access. (The two keys share a data block.)
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
record.get_id = get_id_offset;
|
|
|
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
|
|
record.caller = TableReaderCaller::kUserMultiGet;
|
|
|
|
record.get_from_user_specified_snapshot = false;
|
|
|
|
record.referenced_key = encoded_keys[0];
|
|
|
|
record.referenced_key_exist_in_block = true;
|
|
|
|
record.is_cache_hit = true;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
record.is_cache_hit = !first_pass;
|
|
|
|
record.block_type = TraceType::kBlockTraceDataBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
}
|
|
|
|
VerifyBlockAccessTrace(&c, expected_records);
|
|
|
|
cache->VerifyExpectedHitMissCounts(expected_records);
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
2019-07-17 20:02:00 +00:00
|
|
|
TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) {
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
table_options.block_cache = NewLRUCache(1024 * 1024, 0);
|
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
SetupTracingTest(&c);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2019-07-17 20:02:00 +00:00
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
2023-04-21 16:07:18 +00:00
|
|
|
const ReadOptions read_options;
|
2019-07-17 20:02:00 +00:00
|
|
|
for (uint32_t i = 1; i <= 2; i++) {
|
Refactor block cache tracing w/improved MultiGet (#11339)
Summary:
After https://github.com/facebook/rocksdb/issues/11301, I wasn't sure whether I had regressed block cache tracing with MultiGet. Demo PR https://github.com/facebook/rocksdb/issues/11330 shows the flawed state of tracing MultiGet before my change, and based on the unit test, there was essentially no change in tracing behavior with https://github.com/facebook/rocksdb/issues/11301. This change is to leave that code and behavior better than I found it.
This change is not intended to change any production behaviors except when block cache tracing is active, though might improve general read path efficiency by disabling some related tracking when such tracing is disabled.
More detail on production code:
* Refactoring to consolidate the construction of BlockCacheTraceRecord, and other related functionality, in block-based table reader, though it's somewhat awkward to preserve an optimization to avoid copying Slices into temporary strings in BlockCacheLookupContext.
* Accurately track cache hits and misses (etc.) for each data block accessed by a MultiGet(). (Previously reported hits as misses.)
* Reduced repeated checking of `block_cache_tracer_` state (by creating lookup_context only when active) for efficiency and to reduce the risk of corner case bugs where tracing is enabled or disabled for different parts of a read op. (See a TODO below)
* Improved estimate calculation for num_keys_in_block (see code comment)
Possible follow-up:
* `XXX:` use_cache=true means double cache query? (possible double-query of block cache when allow_mmap_reads=true)
* `TODO:` need more than one lookup_context here to track individual filter and index partition hits and misses
* `TODO:` optimize more state checks of `block_cache_tracer_` down to `lookup_context != nullptr`
* Pre-existing `XXX:` There appear to be 'break' statements above that bypass this writing of the block cache trace record
* Expand test coverage (see below)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11339
Test Plan:
* Added a basic unit test for block cache tracing MultiGet, for now just covering one data block with two keys.
* Added HitMissCountingCache to independently verify that the actual block cache trace and expected block cache trace also agree with the actual number of cache hits / misses (nothing missing or mislabeled). For now only used with MultiGet test.
* Better testing of num_keys_in_block, for now just with MultiGet
* Misc improvements to table_test to improve clarity, such as making it clear that certain keys are auto-inserted at the start of every test.
Performance test:
Testing multireadrandom as in https://github.com/facebook/rocksdb/issues/11301, except averaging over distinct runs rather than [-X30] which doesn't seem to sufficiently reset after each run to work as an independent test run.
Base with revert of 11301: 3148926 ops/sec
Base: 3019146 ops/sec
New: 2999529 ops/sec
Possibly a tiny MultiGet CPU regression with this change. We are now always allocating an additional vector for the LookupContexts. I'm still contemplating options to try to correct the regression in https://github.com/facebook/rocksdb/issues/11301.
Testing readrandom:
Base with revert of 11301: 2311988
Base: 2281726
New: 2299722
Possibly a tiny Get CPU improvement with this change. We are now avoiding some unnecessary LookupContext population.
Reviewed By: akankshamahajan15
Differential Revision: D44557845
Pulled By: pdillinger
fbshipit-source-id: b841691799d2a48fb59cc8880dc7cbb1e107ae3d
2023-04-07 19:55:56 +00:00
|
|
|
InternalKey internal_key(auto_add_key1, 0, kTypeValue);
|
2019-07-17 20:02:00 +00:00
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
|
|
|
c.GetTableReader()->ApproximateOffsetOf(
|
2023-04-21 16:07:18 +00:00
|
|
|
read_options, encoded_key, TableReaderCaller::kUserApproximateSize);
|
2019-07-17 20:02:00 +00:00
|
|
|
}
|
|
|
|
// Verify traces.
|
|
|
|
std::vector<BlockCacheTraceRecord> expected_records;
|
|
|
|
// The first two records should be prefetching index and filter blocks.
|
|
|
|
BlockCacheTraceRecord record;
|
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
|
|
record.caller = TableReaderCaller::kPrefetch;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = false;
|
|
|
|
record.no_insert = false;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
|
|
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
// Then we should have two records for only index blocks.
|
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
|
|
record.caller = TableReaderCaller::kUserApproximateSize;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = true;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
|
|
|
expected_records.push_back(record);
|
|
|
|
VerifyBlockAccessTrace(&c, expected_records);
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest, TracingIterator) {
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
table_options.block_cache = NewLRUCache(1024 * 1024, 0);
|
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
SetupTracingTest(&c);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2019-07-17 20:02:00 +00:00
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
|
|
|
|
for (uint32_t i = 1; i <= 2; i++) {
|
2020-08-03 22:21:56 +00:00
|
|
|
ReadOptions read_options;
|
2019-07-17 20:02:00 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(c.GetTableReader()->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
2019-07-17 20:02:00 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUserIterator));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
while (iter->Valid()) {
|
|
|
|
iter->key();
|
|
|
|
iter->value();
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
iter.reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Verify traces.
|
|
|
|
std::vector<BlockCacheTraceRecord> expected_records;
|
|
|
|
// The first two records should be prefetching index and filter blocks.
|
|
|
|
BlockCacheTraceRecord record;
|
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
|
|
record.caller = TableReaderCaller::kPrefetch;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = false;
|
|
|
|
record.no_insert = false;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
|
|
|
record.block_type = TraceType::kBlockTraceFilterBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
// Then we should have three records for index and two data block access.
|
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
|
|
|
record.caller = TableReaderCaller::kUserIterator;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = true;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
|
|
|
record.block_type = TraceType::kBlockTraceDataBlock;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = false;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
|
|
|
expected_records.push_back(record);
|
2023-12-06 21:48:15 +00:00
|
|
|
// When we iterate this file for the second time, we should observe all
|
|
|
|
// cache hits.
|
2019-07-17 20:02:00 +00:00
|
|
|
record.block_type = TraceType::kBlockTraceIndexBlock;
|
2022-10-21 19:15:35 +00:00
|
|
|
record.is_cache_hit = true;
|
2019-07-17 20:02:00 +00:00
|
|
|
expected_records.push_back(record);
|
|
|
|
record.block_type = TraceType::kBlockTraceDataBlock;
|
|
|
|
expected_records.push_back(record);
|
|
|
|
expected_records.push_back(record);
|
|
|
|
VerifyBlockAccessTrace(&c, expected_records);
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
2014-02-19 23:38:57 +00:00
|
|
|
// A simple tool that takes the snapshot of block cache statistics.
|
|
|
|
class BlockCachePropertiesSnapshot {
|
2013-11-13 06:46:51 +00:00
|
|
|
public:
|
2014-02-19 23:38:57 +00:00
|
|
|
explicit BlockCachePropertiesSnapshot(Statistics* statistics) {
|
2014-01-17 20:46:06 +00:00
|
|
|
block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS);
|
|
|
|
block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT);
|
|
|
|
index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS);
|
|
|
|
index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT);
|
|
|
|
data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS);
|
|
|
|
data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT);
|
2014-02-19 23:38:57 +00:00
|
|
|
filter_block_cache_miss =
|
|
|
|
statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS);
|
|
|
|
filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
|
2015-10-07 22:17:20 +00:00
|
|
|
block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ);
|
|
|
|
block_cache_bytes_write =
|
|
|
|
statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE);
|
2014-02-19 23:38:57 +00:00
|
|
|
}
|
|
|
|
|
2014-10-31 18:59:54 +00:00
|
|
|
void AssertIndexBlockStat(int64_t expected_index_block_cache_miss,
|
|
|
|
int64_t expected_index_block_cache_hit) {
|
|
|
|
ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
|
2014-02-19 23:38:57 +00:00
|
|
|
}
|
|
|
|
|
2014-10-31 18:59:54 +00:00
|
|
|
void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss,
|
|
|
|
int64_t expected_filter_block_cache_hit) {
|
|
|
|
ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit);
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
|
2013-11-20 00:29:42 +00:00
|
|
|
// Check if the fetched props matches the expected ones.
|
2014-02-19 23:38:57 +00:00
|
|
|
// TODO(kailiu) Use this only when you disabled filter policy!
|
2014-10-31 18:59:54 +00:00
|
|
|
void AssertEqual(int64_t expected_index_block_cache_miss,
|
|
|
|
int64_t expected_index_block_cache_hit,
|
|
|
|
int64_t expected_data_block_cache_miss,
|
|
|
|
int64_t expected_data_block_cache_hit) const {
|
|
|
|
ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit);
|
|
|
|
ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit);
|
|
|
|
ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss,
|
|
|
|
block_cache_miss);
|
|
|
|
ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit,
|
|
|
|
block_cache_hit);
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
|
2015-10-07 22:17:20 +00:00
|
|
|
int64_t GetCacheBytesRead() { return block_cache_bytes_read; }
|
|
|
|
|
|
|
|
int64_t GetCacheBytesWrite() { return block_cache_bytes_write; }
|
|
|
|
|
2013-11-13 06:46:51 +00:00
|
|
|
private:
|
2014-02-05 00:21:47 +00:00
|
|
|
int64_t block_cache_miss = 0;
|
|
|
|
int64_t block_cache_hit = 0;
|
|
|
|
int64_t index_block_cache_miss = 0;
|
|
|
|
int64_t index_block_cache_hit = 0;
|
|
|
|
int64_t data_block_cache_miss = 0;
|
|
|
|
int64_t data_block_cache_hit = 0;
|
2014-02-19 23:38:57 +00:00
|
|
|
int64_t filter_block_cache_miss = 0;
|
|
|
|
int64_t filter_block_cache_hit = 0;
|
2015-10-07 22:17:20 +00:00
|
|
|
int64_t block_cache_bytes_read = 0;
|
|
|
|
int64_t block_cache_bytes_write = 0;
|
2013-11-13 06:46:51 +00:00
|
|
|
};
|
|
|
|
|
2023-12-06 21:48:15 +00:00
|
|
|
// Make sure, by default, index/filter blocks were pre-loaded (meaning we
|
|
|
|
// won't use block cache to store them).
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) {
|
2014-02-19 23:38:57 +00:00
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.statistics = CreateDBStatistics();
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2016-04-07 20:51:47 +00:00
|
|
|
table_options.block_cache = NewLRUCache(1024, 4);
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
|
2014-02-19 23:38:57 +00:00
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2014-02-19 23:38:57 +00:00
|
|
|
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2014-02-19 23:38:57 +00:00
|
|
|
c.Add("key", "value");
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2014-08-25 21:22:05 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
2014-02-19 23:38:57 +00:00
|
|
|
|
|
|
|
// preloading filter/index blocks is enabled.
|
2014-08-25 23:14:30 +00:00
|
|
|
auto reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 20:11:23 +00:00
|
|
|
ASSERT_FALSE(reader->TEST_FilterBlockInCache());
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 18:49:36 +00:00
|
|
|
ASSERT_FALSE(reader->TEST_IndexBlockInCache());
|
2014-02-19 23:38:57 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
// nothing happens in the beginning
|
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertIndexBlockStat(0, 0);
|
|
|
|
props.AssertFilterBlockStat(0, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
2014-09-29 18:09:09 +00:00
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
2015-03-03 18:59:36 +00:00
|
|
|
GetContext::kNotFound, Slice(), nullptr, nullptr,
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
nullptr, nullptr, true, nullptr, nullptr);
|
2014-02-19 23:38:57 +00:00
|
|
|
// a hack that just to trigger BlockBasedTable::GetFilter.
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(reader->Get(ReadOptions(), "non-exist-key", &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
2014-02-19 23:38:57 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertIndexBlockStat(0, 0);
|
|
|
|
props.AssertFilterBlockStat(0, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Due to the difficulities of the intersaction between statistics, this test
|
|
|
|
// only tests the case when "index block is put to block cache"
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) {
|
2013-11-13 06:46:51 +00:00
|
|
|
// -- Table construction
|
2013-11-20 06:00:48 +00:00
|
|
|
Options options;
|
2013-11-13 06:46:51 +00:00
|
|
|
options.create_if_missing = true;
|
2017-05-02 20:39:09 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2014-01-24 18:57:15 +00:00
|
|
|
|
|
|
|
// Enable the cache for index/filter blocks
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2019-09-16 22:14:51 +00:00
|
|
|
LRUCacheOptions co;
|
|
|
|
co.capacity = 2048;
|
|
|
|
co.num_shard_bits = 2;
|
|
|
|
co.metadata_charge_policy = kDontChargeCacheMetadata;
|
|
|
|
table_options.block_cache = NewLRUCache(co);
|
2014-01-24 18:57:15 +00:00
|
|
|
table_options.cache_index_and_filter_blocks = true;
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
2013-11-13 06:46:51 +00:00
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2013-11-13 06:46:51 +00:00
|
|
|
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2013-11-13 06:46:51 +00:00
|
|
|
c.Add("key", "value");
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2014-08-25 21:22:05 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
2014-02-19 23:38:57 +00:00
|
|
|
// preloading filter/index blocks is prohibited.
|
2014-10-22 18:52:35 +00:00
|
|
|
auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 20:11:23 +00:00
|
|
|
ASSERT_FALSE(reader->TEST_FilterBlockInCache());
|
Move the index readers out of the block cache (#5298)
Summary:
Currently, when the block cache is used for index blocks as well, it is
not really the index block that is stored in the cache but an
IndexReader object. Since this object is not pure data (it has, for
instance, pointers that might dangle), it's not really sharable. To
avoid the issues around this, the current code uses a dummy unique cache
key for each TableReader to store the IndexReader, and erases the
IndexReader entry when the TableReader is closed. Instead of doing this,
the new code moves the IndexReader out of the cache altogether. In
particular, instead of the TableReader owning, or caching/pinning the
IndexReader based on the customer's settings, the TableReader
unconditionally owns the IndexReader, which in turn owns/caches/pins
the index block (which is itself sharable and thus can be safely put in
the cache without any hacks).
Note: the change has two side effects:
1) Partitions of partitioned indexes no longer affect the read
amplification statistics.
2) Eviction statistics for index blocks are temporarily broken. We plan to fix
this in a separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5298
Differential Revision: D15303203
Pulled By: ltamasi
fbshipit-source-id: 935a69ba59d87d5e44f42e2310619b790c366e47
2019-05-30 18:49:36 +00:00
|
|
|
ASSERT_TRUE(reader->TEST_IndexBlockInCache());
|
2013-11-13 06:46:51 +00:00
|
|
|
|
|
|
|
// -- PART 1: Open with regular block cache.
|
|
|
|
// Since block_cache is disabled, no cache activities will be involved.
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter;
|
2013-11-13 06:46:51 +00:00
|
|
|
|
2015-10-07 22:17:20 +00:00
|
|
|
int64_t last_cache_bytes_read = 0;
|
2013-11-13 06:46:51 +00:00
|
|
|
// At first, no block will be accessed.
|
|
|
|
{
|
2014-02-19 23:38:57 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
2013-11-13 06:46:51 +00:00
|
|
|
// index will be added to block cache.
|
2014-02-05 00:21:47 +00:00
|
|
|
props.AssertEqual(1, // index block miss
|
|
|
|
0, 0, 0);
|
2015-10-07 22:17:20 +00:00
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), 0);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesWrite(),
|
2019-09-09 18:22:28 +00:00
|
|
|
static_cast<int64_t>(table_options.block_cache->GetUsage()));
|
2015-10-07 22:17:20 +00:00
|
|
|
last_cache_bytes_read = props.GetCacheBytesRead();
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Only index block will be accessed
|
|
|
|
{
|
2018-05-21 21:33:55 +00:00
|
|
|
iter.reset(c.NewIterator(moptions.prefix_extractor.get()));
|
2014-02-19 23:38:57 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
2013-11-13 06:46:51 +00:00
|
|
|
// NOTE: to help better highlight the "detla" of each ticker, I use
|
|
|
|
// <last_value> + <added_value> to indicate the increment of changed
|
|
|
|
// value; other numbers remain the same.
|
2014-02-05 00:21:47 +00:00
|
|
|
props.AssertEqual(1, 0 + 1, // index block hit
|
|
|
|
0, 0);
|
2015-10-07 22:17:20 +00:00
|
|
|
// Cache hit, bytes read from cache should increase
|
|
|
|
ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesWrite(),
|
2019-09-09 18:22:28 +00:00
|
|
|
static_cast<int64_t>(table_options.block_cache->GetUsage()));
|
2015-10-07 22:17:20 +00:00
|
|
|
last_cache_bytes_read = props.GetCacheBytesRead();
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Only data block will be accessed
|
|
|
|
{
|
|
|
|
iter->SeekToFirst();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2014-02-19 23:38:57 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
2014-02-05 00:21:47 +00:00
|
|
|
props.AssertEqual(1, 1, 0 + 1, // data block miss
|
|
|
|
0);
|
2015-10-07 22:17:20 +00:00
|
|
|
// Cache miss, Bytes read from cache should not change
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesWrite(),
|
2019-09-09 18:22:28 +00:00
|
|
|
static_cast<int64_t>(table_options.block_cache->GetUsage()));
|
2015-10-07 22:17:20 +00:00
|
|
|
last_cache_bytes_read = props.GetCacheBytesRead();
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Data block will be in cache
|
|
|
|
{
|
2018-05-21 21:33:55 +00:00
|
|
|
iter.reset(c.NewIterator(moptions.prefix_extractor.get()));
|
2013-11-13 06:46:51 +00:00
|
|
|
iter->SeekToFirst();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2014-02-19 23:38:57 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
2014-02-05 00:21:47 +00:00
|
|
|
props.AssertEqual(1, 1 + 1, /* index block hit */
|
|
|
|
1, 0 + 1 /* data block hit */);
|
2015-10-07 22:17:20 +00:00
|
|
|
// Cache hit, bytes read from cache should increase
|
|
|
|
ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
|
|
|
|
ASSERT_EQ(props.GetCacheBytesWrite(),
|
2019-09-09 18:22:28 +00:00
|
|
|
static_cast<int64_t>(table_options.block_cache->GetUsage()));
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
// release the iterator so that the block cache can reset correctly.
|
|
|
|
iter.reset();
|
2017-05-02 20:39:09 +00:00
|
|
|
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
|
|
|
|
2014-10-22 18:52:35 +00:00
|
|
|
// -- PART 2: Open with very small block cache
|
2013-11-13 06:46:51 +00:00
|
|
|
// In this test, no block will ever get hit since the block cache is
|
|
|
|
// too small to fit even one entry.
|
2016-04-07 20:51:47 +00:00
|
|
|
table_options.block_cache = NewLRUCache(1, 4);
|
2017-05-02 20:39:09 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2014-08-25 21:22:05 +00:00
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions2(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions2(options);
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(c.Reopen(ioptions2, moptions2));
|
2013-11-13 06:46:51 +00:00
|
|
|
{
|
2014-02-19 23:38:57 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
2014-02-05 00:21:47 +00:00
|
|
|
props.AssertEqual(1, // index block miss
|
|
|
|
0, 0, 0);
|
2015-10-07 22:17:20 +00:00
|
|
|
// Cache miss, Bytes read from cache should not change
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), 0);
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// Both index and data block get accessed.
|
|
|
|
// It first cache index block then data block. But since the cache size
|
|
|
|
// is only 1, index block will be purged after data block is inserted.
|
2018-05-21 21:33:55 +00:00
|
|
|
iter.reset(c.NewIterator(moptions2.prefix_extractor.get()));
|
2014-02-19 23:38:57 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
2014-02-05 00:21:47 +00:00
|
|
|
props.AssertEqual(1 + 1, // index block miss
|
|
|
|
0, 0, // data block miss
|
|
|
|
0);
|
2015-10-07 22:17:20 +00:00
|
|
|
// Cache hit, bytes read from cache should increase
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), 0);
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
// SeekToFirst() accesses data block. With similar reason, we expect data
|
|
|
|
// block's cache miss.
|
|
|
|
iter->SeekToFirst();
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2014-02-19 23:38:57 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
2014-02-05 00:21:47 +00:00
|
|
|
props.AssertEqual(2, 0, 0 + 1, // data block miss
|
|
|
|
0);
|
2015-10-07 22:17:20 +00:00
|
|
|
// Cache miss, Bytes read from cache should not change
|
|
|
|
ASSERT_EQ(props.GetCacheBytesRead(), 0);
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
2014-10-22 18:52:35 +00:00
|
|
|
iter.reset();
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2014-10-22 18:52:35 +00:00
|
|
|
|
|
|
|
// -- PART 3: Open table with bloom filter enabled but not in SST file
|
2016-04-07 20:51:47 +00:00
|
|
|
table_options.block_cache = NewLRUCache(4096, 4);
|
2014-10-22 18:52:35 +00:00
|
|
|
table_options.cache_index_and_filter_blocks = false;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
TableConstructor c3(BytewiseComparator());
|
2014-10-22 20:53:35 +00:00
|
|
|
std::string user_key = "k01";
|
|
|
|
InternalKey internal_key(user_key, 0, kTypeValue);
|
|
|
|
c3.Add(internal_key.Encode().ToString(), "hello");
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions3(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
MutableCFOptions moptions3(options);
|
2014-10-22 18:52:35 +00:00
|
|
|
// Generate table without filter policy
|
2018-05-21 21:33:55 +00:00
|
|
|
c3.Finish(options, ioptions3, moptions3, table_options,
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
c3.ResetTableReader();
|
|
|
|
|
2014-10-22 18:52:35 +00:00
|
|
|
// Open table with filter policy
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(1));
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
2017-05-02 20:39:09 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions4(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
MutableCFOptions moptions4(options);
|
|
|
|
ASSERT_OK(c3.Reopen(ioptions4, moptions4));
|
2014-10-22 18:52:35 +00:00
|
|
|
reader = dynamic_cast<BlockBasedTable*>(c3.GetTableReader());
|
Move the filter readers out of the block cache (#5504)
Summary:
Currently, when the block cache is used for the filter block, it is not
really the block itself that is stored in the cache but a FilterBlockReader
object. Since this object is not pure data (it has, for instance, pointers that
might dangle, including in one case a back pointer to the TableReader), it's not
really sharable. To avoid the issues around this, the current code erases the
cache entries when the TableReader is closed (which, BTW, is not sufficient
since a concurrent TableReader might have picked up the object in the meantime).
Instead of doing this, the patch moves the FilterBlockReader out of the cache
altogether, and decouples the filter reader object from the filter block.
In particular, instead of the TableReader owning, or caching/pinning the
FilterBlockReader (based on the customer's settings), with the change the
TableReader unconditionally owns the FilterBlockReader, which in turn
owns/caches/pins the filter block. This change also enables us to reuse the code
paths historically used for data blocks for filters as well.
Note:
Eviction statistics for filter blocks are temporarily broken. We plan to fix this in a
separate phase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5504
Test Plan: make asan_check
Differential Revision: D16036974
Pulled By: ltamasi
fbshipit-source-id: 770f543c5fb4ed126fd1e04bfd3809cf4ff9c091
2019-07-16 20:11:23 +00:00
|
|
|
ASSERT_FALSE(reader->TEST_FilterBlockInCache());
|
2017-03-13 18:44:50 +00:00
|
|
|
PinnableSlice value;
|
2014-10-22 18:52:35 +00:00
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
2015-03-03 18:59:36 +00:00
|
|
|
GetContext::kNotFound, user_key, &value, nullptr,
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
nullptr, nullptr, true, nullptr, nullptr);
|
2018-05-26 01:41:31 +00:00
|
|
|
ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context,
|
2018-05-21 21:33:55 +00:00
|
|
|
moptions4.prefix_extractor.get()));
|
2017-03-13 18:44:50 +00:00
|
|
|
ASSERT_STREQ(value.data(), "hello");
|
2014-10-22 18:52:35 +00:00
|
|
|
BlockCachePropertiesSnapshot props(options.statistics.get());
|
|
|
|
props.AssertFilterBlockStat(0, 0);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c3.ResetTableReader();
|
2013-11-13 06:46:51 +00:00
|
|
|
}
|
|
|
|
|
2016-01-04 18:51:00 +00:00
|
|
|
void ValidateBlockSizeDeviation(int value, int expected) {
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size_deviation = value;
|
|
|
|
BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options);
|
|
|
|
|
|
|
|
const BlockBasedTableOptions* normalized_table_options =
|
2020-09-14 23:59:00 +00:00
|
|
|
factory->GetOptions<BlockBasedTableOptions>();
|
2016-01-04 18:51:00 +00:00
|
|
|
ASSERT_EQ(normalized_table_options->block_size_deviation, expected);
|
|
|
|
|
|
|
|
delete factory;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ValidateBlockRestartInterval(int value, int expected) {
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_restart_interval = value;
|
|
|
|
BlockBasedTableFactory* factory = new BlockBasedTableFactory(table_options);
|
|
|
|
|
|
|
|
const BlockBasedTableOptions* normalized_table_options =
|
2020-09-14 23:59:00 +00:00
|
|
|
factory->GetOptions<BlockBasedTableOptions>();
|
2016-01-04 18:51:00 +00:00
|
|
|
ASSERT_EQ(normalized_table_options->block_restart_interval, expected);
|
|
|
|
|
|
|
|
delete factory;
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, InvalidOptions) {
|
2023-12-06 21:48:15 +00:00
|
|
|
// invalid values for block_size_deviation (<0 or >100) are silently set to
|
|
|
|
// 0
|
2016-01-04 18:51:00 +00:00
|
|
|
ValidateBlockSizeDeviation(-10, 0);
|
|
|
|
ValidateBlockSizeDeviation(-1, 0);
|
|
|
|
ValidateBlockSizeDeviation(0, 0);
|
|
|
|
ValidateBlockSizeDeviation(1, 1);
|
|
|
|
ValidateBlockSizeDeviation(99, 99);
|
|
|
|
ValidateBlockSizeDeviation(100, 100);
|
|
|
|
ValidateBlockSizeDeviation(101, 0);
|
|
|
|
ValidateBlockSizeDeviation(1000, 0);
|
|
|
|
|
|
|
|
// invalid values for block_restart_interval (<1) are silently set to 1
|
|
|
|
ValidateBlockRestartInterval(-10, 1);
|
|
|
|
ValidateBlockRestartInterval(-1, 1);
|
|
|
|
ValidateBlockRestartInterval(0, 1);
|
|
|
|
ValidateBlockRestartInterval(1, 1);
|
|
|
|
ValidateBlockRestartInterval(2, 2);
|
|
|
|
ValidateBlockRestartInterval(1000, 1000);
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BlockReadCountTest) {
|
Hide deprecated, inefficient block-based filter from public API (#9535)
Summary:
This change removes the ability to configure the deprecated,
inefficient block-based filter in the public API. Options that would
have enabled it now use "full" (and optionally partitioned) filters.
Existing block-based filters can still be read and used, and a "back
door" way to build them still exists, for testing and in case of trouble.
About the only way this removal would cause an issue for users is if
temporary memory for filter construction greatly increases. In
HISTORY.md we suggest a few possible mitigations: partitioned filters,
smaller SST files, or setting reserve_table_builder_memory=true.
Or users who have customized a FilterPolicy using the
CreateFilter/KeyMayMatch mechanism removed in https://github.com/facebook/rocksdb/issues/9501 will have to upgrade
their code. (It's long past time for people to move to the new
builder/reader customization interface.)
This change also introduces some internal-use-only configuration strings
for testing specific filter implementations while bypassing some
compatibility / intelligence logic. This is intended to hint at a path
toward making FilterPolicy Customizable, but it also gives us a "back
door" way to configure block-based filter.
Aside: updated db_bench so that -readonly implies -use_existing_db
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9535
Test Plan:
Unit tests updated. Specifically,
* BlockBasedTableTest.BlockReadCountTest is tweaked to validate the back
door configuration interface and ignoring of `use_block_based_builder`.
* BlockBasedTableTest.TracingGetTest is migrated from testing
block-based filter access pattern to full filter access patter, by
re-ordering some things.
* Options test (pretty self-explanatory)
Performance test - create with `./db_bench -db=/dev/shm/rocksdb1 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0` with and without `-use_block_based_filter`, which creates a DB with 21 SST files in L0. Read with `./db_bench -db=/dev/shm/rocksdb1 -readonly -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=30`
Without -use_block_based_filter: readrandom 464 ops/sec, 689280 KB DB
With -use_block_based_filter: readrandom 169 ops/sec, 690996 KB DB
No consistent difference with fillrandom
Reviewed By: jay-zhuang
Differential Revision: D34153871
Pulled By: pdillinger
fbshipit-source-id: 31f4a933c542f8f09aca47fa64aec67832a69738
2022-02-12 15:04:09 +00:00
|
|
|
// bloom_filter_type = 1 -- full filter using use_block_based_builder=false
|
|
|
|
// bloom_filter_type = 2 -- full filter using use_block_based_builder=true
|
|
|
|
// because of API change to hide block-based filter
|
Remove deprecated block-based filter (#10184)
Summary:
In https://github.com/facebook/rocksdb/issues/9535, release 7.0, we hid the old block-based filter from being created using
the public API, because of its inefficiency. Although we normally maintain read compatibility
on old DBs forever, filters are not required for reading a DB, only for optimizing read
performance. Thus, it should be acceptable to remove this code and the substantial
maintenance burden it carries as useful features are developed and validated (such
as user timestamp).
This change completely removes the code for reading and writing the old block-based
filters, net removing about 1370 lines of code no longer needed. Options removed from
testing / benchmarking tools. The prior existence is only evident in a couple of places:
* `CacheEntryRole::kDeprecatedFilterBlock` - We can update this public API enum in
a major release to minimize source code incompatibilities.
* A warning is logged when an old table file is opened that used the old block-based
filter. This is provided as a courtesy, and would be a pain to unit test, so manual testing
should suffice. Unfortunately, sst_dump does not tell you whether a file uses
block-based filter, and the structure of the code makes it very difficult to fix.
* To detect that case, `kObsoleteFilterBlockPrefix` (renamed from `kFilterBlockPrefix`)
for metaindex is maintained (for now).
Other notes:
* In some cases where numbers are associated with filter configurations, we have had to
update the assigned numbers so that they all correspond to something that exists.
* Fixed potential stat counting bug by assuming `filter_checked = false` for cases
like `filter == nullptr` rather than assuming `filter_checked = true`
* Removed obsolete `block_offset` and `prefix_extractor` parameters from several
functions.
* Removed some unnecessary checks `if (!table_prefix_extractor() && !prefix_extractor)`
because the caller guarantees the prefix extractor exists and is compatible
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10184
Test Plan:
tests updated, manually test new warning in LOG using base version to
generate a DB
Reviewed By: riversand963
Differential Revision: D37212647
Pulled By: pdillinger
fbshipit-source-id: 06ee020d8de3b81260ffc36ad0c1202cbf463a80
2022-06-16 22:51:33 +00:00
|
|
|
for (int bloom_filter_type = 1; bloom_filter_type <= 2; ++bloom_filter_type) {
|
2015-09-02 22:36:47 +00:00
|
|
|
for (int index_and_filter_in_cache = 0; index_and_filter_in_cache < 2;
|
|
|
|
++index_and_filter_in_cache) {
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2015-09-02 22:36:47 +00:00
|
|
|
table_options.block_cache = NewLRUCache(1, 0);
|
|
|
|
table_options.cache_index_and_filter_blocks = index_and_filter_in_cache;
|
Remove deprecated block-based filter (#10184)
Summary:
In https://github.com/facebook/rocksdb/issues/9535, release 7.0, we hid the old block-based filter from being created using
the public API, because of its inefficiency. Although we normally maintain read compatibility
on old DBs forever, filters are not required for reading a DB, only for optimizing read
performance. Thus, it should be acceptable to remove this code and the substantial
maintenance burden it carries as useful features are developed and validated (such
as user timestamp).
This change completely removes the code for reading and writing the old block-based
filters, net removing about 1370 lines of code no longer needed. Options removed from
testing / benchmarking tools. The prior existence is only evident in a couple of places:
* `CacheEntryRole::kDeprecatedFilterBlock` - We can update this public API enum in
a major release to minimize source code incompatibilities.
* A warning is logged when an old table file is opened that used the old block-based
filter. This is provided as a courtesy, and would be a pain to unit test, so manual testing
should suffice. Unfortunately, sst_dump does not tell you whether a file uses
block-based filter, and the structure of the code makes it very difficult to fix.
* To detect that case, `kObsoleteFilterBlockPrefix` (renamed from `kFilterBlockPrefix`)
for metaindex is maintained (for now).
Other notes:
* In some cases where numbers are associated with filter configurations, we have had to
update the assigned numbers so that they all correspond to something that exists.
* Fixed potential stat counting bug by assuming `filter_checked = false` for cases
like `filter == nullptr` rather than assuming `filter_checked = true`
* Removed obsolete `block_offset` and `prefix_extractor` parameters from several
functions.
* Removed some unnecessary checks `if (!table_prefix_extractor() && !prefix_extractor)`
because the caller guarantees the prefix extractor exists and is compatible
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10184
Test Plan:
tests updated, manually test new warning in LOG using base version to
generate a DB
Reviewed By: riversand963
Differential Revision: D37212647
Pulled By: pdillinger
fbshipit-source-id: 06ee020d8de3b81260ffc36ad0c1202cbf463a80
2022-06-16 22:51:33 +00:00
|
|
|
table_options.filter_policy.reset(
|
|
|
|
NewBloomFilterPolicy(10, bloom_filter_type == 2));
|
2015-09-02 22:36:47 +00:00
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 23:05:53 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2015-09-02 22:36:47 +00:00
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
std::string user_key = "k04";
|
|
|
|
InternalKey internal_key(user_key, 0, kTypeValue);
|
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
|
|
|
c.Add(encoded_key, "hello");
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
MutableCFOptions moptions(options);
|
2015-09-02 22:36:47 +00:00
|
|
|
// Generate table with filter policy
|
2018-05-21 21:33:55 +00:00
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
2015-09-02 22:36:47 +00:00
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
auto reader = c.GetTableReader();
|
2017-03-13 18:44:50 +00:00
|
|
|
PinnableSlice value;
|
2019-07-04 01:45:36 +00:00
|
|
|
{
|
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
|
|
GetContext::kNotFound, user_key, &value, nullptr,
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
nullptr, nullptr, true, nullptr, nullptr);
|
2019-07-04 01:45:36 +00:00
|
|
|
get_perf_context()->Reset();
|
|
|
|
ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
|
|
|
if (index_and_filter_in_cache) {
|
|
|
|
// data, index and filter block
|
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 3);
|
|
|
|
ASSERT_EQ(get_perf_context()->index_block_read_count, 1);
|
|
|
|
ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
|
|
|
|
} else {
|
|
|
|
// just the data block
|
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 1);
|
|
|
|
}
|
|
|
|
ASSERT_EQ(get_context.State(), GetContext::kFound);
|
|
|
|
ASSERT_STREQ(value.data(), "hello");
|
2015-09-02 22:36:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Get non-existing key
|
|
|
|
user_key = "does-not-exist";
|
|
|
|
internal_key = InternalKey(user_key, 0, kTypeValue);
|
|
|
|
encoded_key = internal_key.Encode().ToString();
|
|
|
|
|
2017-03-13 18:44:50 +00:00
|
|
|
value.Reset();
|
2019-07-04 01:45:36 +00:00
|
|
|
{
|
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
2015-09-02 22:36:47 +00:00
|
|
|
GetContext::kNotFound, user_key, &value, nullptr,
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
nullptr, nullptr, true, nullptr, nullptr);
|
2019-07-04 01:45:36 +00:00
|
|
|
get_perf_context()->Reset();
|
|
|
|
ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
|
|
|
ASSERT_EQ(get_context.State(), GetContext::kNotFound);
|
|
|
|
}
|
2015-09-02 22:36:47 +00:00
|
|
|
|
|
|
|
if (index_and_filter_in_cache) {
|
|
|
|
if (bloom_filter_type == 0) {
|
|
|
|
// with block-based, we read index and then the filter
|
2017-06-03 00:12:39 +00:00
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 2);
|
2019-06-19 02:00:03 +00:00
|
|
|
ASSERT_EQ(get_perf_context()->index_block_read_count, 1);
|
|
|
|
ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
|
2015-09-02 22:36:47 +00:00
|
|
|
} else {
|
|
|
|
// with full-filter, we read filter first and then we stop
|
2017-06-03 00:12:39 +00:00
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 1);
|
2019-06-19 02:00:03 +00:00
|
|
|
ASSERT_EQ(get_perf_context()->filter_block_read_count, 1);
|
2015-09-02 22:36:47 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// filter is already in memory and it figures out that the key doesn't
|
|
|
|
// exist
|
2017-06-03 00:12:39 +00:00
|
|
|
ASSERT_EQ(get_perf_context()->block_read_count, 0);
|
2015-09-02 22:36:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BlockCacheLeak) {
|
2014-01-24 20:14:08 +00:00
|
|
|
// Check that when we reopen a table we don't lose access to blocks already
|
2023-12-06 21:48:15 +00:00
|
|
|
// in the cache. This test checks whether the Table actually makes use of
|
|
|
|
// the unique ID from the file.
|
2014-01-24 20:14:08 +00:00
|
|
|
|
|
|
|
Options opt;
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<InternalKeyComparator> ikc;
|
2014-01-27 21:53:22 +00:00
|
|
|
ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
|
2014-01-24 20:14:08 +00:00
|
|
|
opt.compression = kNoCompression;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
2014-08-25 21:22:05 +00:00
|
|
|
table_options.block_size = 1024;
|
|
|
|
// big enough so we don't ever lose cached values.
|
2016-04-07 20:51:47 +00:00
|
|
|
table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
|
2014-08-25 21:22:05 +00:00
|
|
|
opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2014-01-24 20:14:08 +00:00
|
|
|
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2014-01-24 20:14:08 +00:00
|
|
|
c.Add("k01", "hello");
|
|
|
|
c.Add("k02", "hello2");
|
|
|
|
c.Add("k03", std::string(10000, 'x'));
|
|
|
|
c.Add("k04", std::string(200000, 'x'));
|
|
|
|
c.Add("k05", std::string(300000, 'x'));
|
|
|
|
c.Add("k06", "hello3");
|
|
|
|
c.Add("k07", std::string(100000, 'x'));
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(opt);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(opt);
|
|
|
|
c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
|
2014-01-24 20:14:08 +00:00
|
|
|
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(
|
2018-05-21 21:33:55 +00:00
|
|
|
c.NewIterator(moptions.prefix_extractor.get()));
|
2014-01-24 20:14:08 +00:00
|
|
|
iter->SeekToFirst();
|
|
|
|
while (iter->Valid()) {
|
|
|
|
iter->key();
|
|
|
|
iter->value();
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_OK(iter->status());
|
Change and clarify the relationship between Valid(), status() and Seek*() for all iterators. Also fix some bugs
Summary:
Before this PR, Iterator/InternalIterator may simultaneously have non-ok status() and Valid() = true. That state means that the last operation failed, but the iterator is nevertheless positioned on some unspecified record. Likely intended uses of that are:
* If some sst files are corrupted, a normal iterator can be used to read the data from files that are not corrupted.
* When using read_tier = kBlockCacheTier, read the data that's in block cache, skipping over the data that is not.
However, this behavior wasn't documented well (and until recently the wiki on github had misleading incorrect information). In the code there's a lot of confusion about the relationship between status() and Valid(), and about whether Seek()/SeekToLast()/etc reset the status or not. There were a number of bugs caused by this confusion, both inside rocksdb and in the code that uses rocksdb (including ours).
This PR changes the convention to:
* If status() is not ok, Valid() always returns false.
* Any seek operation resets status. (Before the PR, it depended on iterator type and on particular error.)
This does sacrifice the two use cases listed above, but siying said it's ok.
Overview of the changes:
* A commit that adds missing status checks in MergingIterator. This fixes a bug that actually affects us, and we need it fixed. `DBIteratorTest.NonBlockingIterationBugRepro` explains the scenario.
* Changes to lots of iterator types to make all of them conform to the new convention. Some bug fixes along the way. By far the biggest changes are in DBIter, which is a big messy piece of code; I tried to make it less big and messy but mostly failed.
* A stress-test for DBIter, to gain some confidence that I didn't break it. It does a few million random operations on the iterator, while occasionally modifying the underlying data (like ForwardIterator does) and occasionally returning non-ok status from internal iterator.
To find the iterator types that needed changes I searched for "public .*Iterator" in the code. Here's an overview of all 27 iterator types:
Iterators that didn't need changes:
* status() is always ok(), or Valid() is always false: MemTableIterator, ModelIter, TestIterator, KVIter (2 classes with this name anonymous namespaces), LoggingForwardVectorIterator, VectorIterator, MockTableIterator, EmptyIterator, EmptyInternalIterator.
* Thin wrappers that always pass through Valid() and status(): ArenaWrappedDBIter, TtlIterator, InternalIteratorFromIterator.
Iterators with changes (see inline comments for details):
* DBIter - an overhaul:
- It used to silently skip corrupted keys (`FindParseableKey()`), which seems dangerous. This PR makes it just stop immediately after encountering a corrupted key, just like it would for other kinds of corruption. Let me know if there was actually some deeper meaning in this behavior and I should put it back.
- It had a few code paths silently discarding subiterator's status. The stress test caught a few.
- The backwards iteration code path was expecting the internal iterator's set of keys to be immutable. It's probably always true in practice at the moment, since ForwardIterator doesn't support backwards iteration, but this PR fixes it anyway. See added DBIteratorTest.ReverseToForwardBug for an example.
- Some parts of backwards iteration code path even did things like `assert(iter_->Valid())` after a seek, which is never a safe assumption.
- It used to not reset status on seek for some types of errors.
- Some simplifications and better comments.
- Some things got more complicated from the added error handling. I'm open to ideas for how to make it nicer.
* MergingIterator - check status after every operation on every subiterator, and in some places assert that valid subiterators have ok status.
* ForwardIterator - changed to the new convention, also slightly simplified.
* ForwardLevelIterator - fixed some bugs and simplified.
* LevelIterator - simplified.
* TwoLevelIterator - changed to the new convention. Also fixed a bug that would make SeekForPrev() sometimes silently ignore errors from first_level_iter_.
* BlockBasedTableIterator - minor changes.
* BlockIter - replaced `SetStatus()` with `Invalidate()` to make sure non-ok BlockIter is always invalid.
* PlainTableIterator - some seeks used to not reset status.
* CuckooTableIterator - tiny code cleanup.
* ManagedIterator - fixed some bugs.
* BaseDeltaIterator - changed to the new convention and fixed a bug.
* BlobDBIterator - seeks used to not reset status.
* KeyConvertingIterator - some small change.
Closes https://github.com/facebook/rocksdb/pull/3810
Differential Revision: D7888019
Pulled By: al13n321
fbshipit-source-id: 4aaf6d3421c545d16722a815b2fa2e7912bc851d
2018-05-17 09:44:14 +00:00
|
|
|
iter.reset();
|
2014-01-24 20:14:08 +00:00
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions1(opt);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions1(opt);
|
|
|
|
ASSERT_OK(c.Reopen(ioptions1, moptions1));
|
2014-08-25 23:14:30 +00:00
|
|
|
auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
|
2014-01-25 05:10:19 +00:00
|
|
|
for (const std::string& key : keys) {
|
2018-05-26 01:41:31 +00:00
|
|
|
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
|
2014-01-24 20:14:08 +00:00
|
|
|
}
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2014-06-20 08:23:02 +00:00
|
|
|
|
|
|
|
// rerun with different block cache
|
2016-04-07 20:51:47 +00:00
|
|
|
table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4);
|
2014-08-25 21:22:05 +00:00
|
|
|
opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions2(opt);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions2(opt);
|
|
|
|
ASSERT_OK(c.Reopen(ioptions2, moptions2));
|
2014-08-25 23:14:30 +00:00
|
|
|
table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader());
|
2014-06-20 08:23:02 +00:00
|
|
|
for (const std::string& key : keys) {
|
2018-05-26 01:41:31 +00:00
|
|
|
InternalKey ikey(key, kMaxSequenceNumber, kTypeValue);
|
|
|
|
ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode()));
|
2014-06-20 08:23:02 +00:00
|
|
|
}
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2014-01-24 20:14:08 +00:00
|
|
|
}
|
|
|
|
|
2018-10-26 21:27:09 +00:00
|
|
|
TEST_P(BlockBasedTableTest, MemoryAllocator) {
|
2021-12-17 12:19:34 +00:00
|
|
|
auto default_memory_allocator = std::make_shared<DefaultMemoryAllocator>();
|
|
|
|
auto custom_memory_allocator =
|
|
|
|
std::make_shared<CountedMemoryAllocator>(default_memory_allocator);
|
2018-10-03 00:21:54 +00:00
|
|
|
{
|
|
|
|
Options opt;
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<InternalKeyComparator> ikc;
|
2018-10-03 00:21:54 +00:00
|
|
|
ikc.reset(new test::PlainInternalKeyComparator(opt.comparator));
|
|
|
|
opt.compression = kNoCompression;
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 1024;
|
|
|
|
LRUCacheOptions lruOptions;
|
2018-11-21 19:28:02 +00:00
|
|
|
lruOptions.memory_allocator = custom_memory_allocator;
|
2018-10-03 00:21:54 +00:00
|
|
|
lruOptions.capacity = 16 * 1024 * 1024;
|
|
|
|
lruOptions.num_shard_bits = 4;
|
|
|
|
table_options.block_cache = NewLRUCache(std::move(lruOptions));
|
|
|
|
opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator(),
|
|
|
|
true /* convert_to_internal_key_ */);
|
|
|
|
c.Add("k01", "hello");
|
|
|
|
c.Add("k02", "hello2");
|
|
|
|
c.Add("k03", std::string(10000, 'x'));
|
|
|
|
c.Add("k04", std::string(200000, 'x'));
|
|
|
|
c.Add("k05", std::string(300000, 'x'));
|
|
|
|
c.Add("k06", "hello3");
|
|
|
|
c.Add("k07", std::string(100000, 'x'));
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(opt);
|
2018-10-03 00:21:54 +00:00
|
|
|
const MutableCFOptions moptions(opt);
|
|
|
|
c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap);
|
|
|
|
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter(
|
2018-10-03 00:21:54 +00:00
|
|
|
c.NewIterator(moptions.prefix_extractor.get()));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
while (iter->Valid()) {
|
|
|
|
iter->key();
|
|
|
|
iter->value();
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
|
|
|
|
// out of scope, block cache should have been deleted, all allocations
|
|
|
|
// deallocated
|
2021-12-17 12:19:34 +00:00
|
|
|
EXPECT_EQ(custom_memory_allocator->GetNumAllocations(),
|
|
|
|
custom_memory_allocator->GetNumDeallocations());
|
2018-10-03 00:21:54 +00:00
|
|
|
// make sure that allocations actually happened through the cache allocator
|
2021-12-17 12:19:34 +00:00
|
|
|
EXPECT_GT(custom_memory_allocator->GetNumAllocations(), 0);
|
2018-10-03 00:21:54 +00:00
|
|
|
}
|
|
|
|
|
2020-02-10 23:42:46 +00:00
|
|
|
// Test the file checksum of block based table
|
|
|
|
TEST_P(BlockBasedTableTest, NoFileChecksum) {
|
|
|
|
Options options;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2020-02-10 23:42:46 +00:00
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
|
|
|
int level = 0;
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2020-02-10 23:42:46 +00:00
|
|
|
std::string column_family_name;
|
|
|
|
|
|
|
|
FileChecksumTestHelper f(true);
|
2022-06-22 22:45:21 +00:00
|
|
|
f.CreateWritableFile();
|
2020-02-10 23:42:46 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2024-10-17 21:13:20 +00:00
|
|
|
builder.reset(moptions.table_factory->NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options,
|
2024-02-02 22:14:43 +00:00
|
|
|
*comparator, &internal_tbl_prop_coll_factories,
|
2021-03-25 21:58:23 +00:00
|
|
|
options.compression, options.compression_opts,
|
2024-11-01 17:08:35 +00:00
|
|
|
kUnknownColumnFamily, column_family_name, level,
|
|
|
|
kUnknownNewestKeyTime),
|
2020-02-10 23:42:46 +00:00
|
|
|
f.GetFileWriter()));
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
|
2020-02-10 23:42:46 +00:00
|
|
|
f.AddKVtoKVMap(1000);
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(f.WriteKVAndFlushTable());
|
2020-06-08 04:54:54 +00:00
|
|
|
ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName);
|
|
|
|
ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum);
|
2020-02-10 23:42:46 +00:00
|
|
|
}
|
|
|
|
|
2020-05-21 15:10:43 +00:00
|
|
|
TEST_P(BlockBasedTableTest, Crc32cFileChecksum) {
|
2020-03-29 22:57:02 +00:00
|
|
|
FileChecksumGenCrc32cFactory* file_checksum_gen_factory =
|
|
|
|
new FileChecksumGenCrc32cFactory();
|
2020-02-10 23:42:46 +00:00
|
|
|
Options options;
|
2020-03-29 22:57:02 +00:00
|
|
|
options.file_checksum_gen_factory.reset(file_checksum_gen_factory);
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2020-02-10 23:42:46 +00:00
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
|
|
|
int level = 0;
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2020-02-10 23:42:46 +00:00
|
|
|
std::string column_family_name;
|
|
|
|
|
2020-03-29 22:57:02 +00:00
|
|
|
FileChecksumGenContext gen_context;
|
|
|
|
gen_context.file_name = "db/tmp";
|
2020-05-21 15:10:43 +00:00
|
|
|
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
|
2020-03-29 22:57:02 +00:00
|
|
|
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
|
|
|
|
gen_context);
|
2020-02-10 23:42:46 +00:00
|
|
|
FileChecksumTestHelper f(true);
|
2022-06-22 22:45:21 +00:00
|
|
|
f.CreateWritableFile();
|
2020-05-21 15:10:43 +00:00
|
|
|
f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
|
2020-02-10 23:42:46 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2024-10-17 21:13:20 +00:00
|
|
|
builder.reset(moptions.table_factory->NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options,
|
2024-02-02 22:14:43 +00:00
|
|
|
*comparator, &internal_tbl_prop_coll_factories,
|
2021-03-25 21:58:23 +00:00
|
|
|
options.compression, options.compression_opts,
|
2024-11-01 17:08:35 +00:00
|
|
|
kUnknownColumnFamily, column_family_name, level,
|
|
|
|
kUnknownNewestKeyTime),
|
2020-02-10 23:42:46 +00:00
|
|
|
f.GetFileWriter()));
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
|
2020-02-10 23:42:46 +00:00
|
|
|
f.AddKVtoKVMap(1000);
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(f.WriteKVAndFlushTable());
|
2020-02-10 23:42:46 +00:00
|
|
|
ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
|
2020-03-29 22:57:02 +00:00
|
|
|
|
2020-05-21 15:10:43 +00:00
|
|
|
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
|
2020-03-29 22:57:02 +00:00
|
|
|
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
|
|
|
|
gen_context);
|
2020-02-10 23:42:46 +00:00
|
|
|
std::string checksum;
|
2020-05-21 15:10:43 +00:00
|
|
|
ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
|
2020-02-10 23:42:46 +00:00
|
|
|
ASSERT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
|
2020-05-21 15:10:43 +00:00
|
|
|
|
|
|
|
// Unit test the generator itself for schema stability
|
|
|
|
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen3 =
|
|
|
|
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
|
|
|
|
gen_context);
|
|
|
|
const char data[] = "here is some data";
|
|
|
|
checksum_crc32c_gen3->Update(data, sizeof(data));
|
|
|
|
checksum_crc32c_gen3->Finalize();
|
|
|
|
checksum = checksum_crc32c_gen3->GetChecksum();
|
|
|
|
ASSERT_STREQ(checksum.c_str(), "\345\245\277\110");
|
2020-02-10 23:42:46 +00:00
|
|
|
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
TEST_F(PlainTableTest, BasicPlainTableProperties) {
|
2014-07-18 07:08:38 +00:00
|
|
|
PlainTableOptions plain_table_options;
|
|
|
|
plain_table_options.user_key_len = 8;
|
|
|
|
plain_table_options.bloom_bits_per_key = 8;
|
|
|
|
plain_table_options.hash_table_ratio = 0;
|
|
|
|
|
|
|
|
PlainTableFactory factory(plain_table_options);
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSWritableFile> sink(new test::StringSink());
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(sink), "" /* don't care */, FileOptions()));
|
2014-01-27 21:53:22 +00:00
|
|
|
Options options;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
2014-01-27 21:53:22 +00:00
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2016-04-07 06:10:32 +00:00
|
|
|
std::string column_family_name;
|
2016-09-18 05:30:43 +00:00
|
|
|
int unknown_level = -1;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
2015-04-06 17:04:30 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
|
2024-02-02 22:14:43 +00:00
|
|
|
&internal_tbl_prop_coll_factories, kNoCompression,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 20:49:24 +00:00
|
|
|
CompressionOptions(), kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
column_family_name, unknown_level,
|
|
|
|
kUnknownNewestKeyTime),
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-17 23:16:11 +00:00
|
|
|
file_writer.get()));
|
2014-01-24 20:14:08 +00:00
|
|
|
|
|
|
|
for (char c = 'a'; c <= 'z'; ++c) {
|
2014-01-27 21:53:22 +00:00
|
|
|
std::string key(8, c);
|
|
|
|
key.append("\1 "); // PlainTable expects internal key structure
|
2014-01-24 20:14:08 +00:00
|
|
|
std::string value(28, c + 42);
|
|
|
|
builder->Add(key, value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(builder->Finish());
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
ASSERT_OK(file_writer->Flush(IOOptions()));
|
2014-01-24 20:14:08 +00:00
|
|
|
|
2015-08-05 14:33:27 +00:00
|
|
|
test::StringSink* ss =
|
2021-01-04 23:59:52 +00:00
|
|
|
static_cast<test::StringSink*>(file_writer->writable_file());
|
|
|
|
std::unique_ptr<FSRandomAccessFile> source(
|
|
|
|
new test::StringSource(ss->contents(), 72242, true));
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
2021-01-04 23:59:52 +00:00
|
|
|
new RandomAccessFileReader(std::move(source), "test"));
|
2014-01-24 20:14:08 +00:00
|
|
|
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
std::unique_ptr<TableProperties> props;
|
Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future.
Test Plan: Run all existing unit tests.
Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D42321
2015-07-17 23:16:11 +00:00
|
|
|
auto s = ReadTableProperties(file_reader.get(), ss->contents().size(),
|
2023-04-21 16:07:18 +00:00
|
|
|
kPlainTableMagicNumber, ioptions, read_options,
|
|
|
|
&props);
|
2014-01-24 20:14:08 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
|
2014-02-08 03:26:49 +00:00
|
|
|
ASSERT_EQ(0ul, props->index_size);
|
|
|
|
ASSERT_EQ(0ul, props->filter_size);
|
|
|
|
ASSERT_EQ(16ul * 26, props->raw_key_size);
|
|
|
|
ASSERT_EQ(28ul * 26, props->raw_value_size);
|
|
|
|
ASSERT_EQ(26ul, props->num_entries);
|
|
|
|
ASSERT_EQ(1ul, props->num_data_blocks);
|
2014-01-24 20:14:08 +00:00
|
|
|
}
|
2020-02-10 23:42:46 +00:00
|
|
|
|
|
|
|
TEST_F(PlainTableTest, NoFileChecksum) {
|
|
|
|
PlainTableOptions plain_table_options;
|
|
|
|
plain_table_options.user_key_len = 20;
|
|
|
|
plain_table_options.bloom_bits_per_key = 8;
|
|
|
|
plain_table_options.hash_table_ratio = 0;
|
|
|
|
PlainTableFactory factory(plain_table_options);
|
|
|
|
|
|
|
|
Options options;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2020-02-10 23:42:46 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2020-02-10 23:42:46 +00:00
|
|
|
std::string column_family_name;
|
|
|
|
int unknown_level = -1;
|
|
|
|
FileChecksumTestHelper f(true);
|
2022-06-22 22:45:21 +00:00
|
|
|
f.CreateWritableFile();
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2020-02-10 23:42:46 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
|
2024-02-02 22:14:43 +00:00
|
|
|
&internal_tbl_prop_coll_factories, kNoCompression,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 20:49:24 +00:00
|
|
|
CompressionOptions(), kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
column_family_name, unknown_level,
|
|
|
|
kUnknownNewestKeyTime),
|
2020-02-10 23:42:46 +00:00
|
|
|
f.GetFileWriter()));
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
|
2020-02-10 23:42:46 +00:00
|
|
|
f.AddKVtoKVMap(1000);
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(f.WriteKVAndFlushTable());
|
2020-06-08 04:54:54 +00:00
|
|
|
ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName);
|
|
|
|
EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum);
|
2020-02-10 23:42:46 +00:00
|
|
|
}
|
|
|
|
|
2020-05-21 15:10:43 +00:00
|
|
|
TEST_F(PlainTableTest, Crc32cFileChecksum) {
|
2020-02-10 23:42:46 +00:00
|
|
|
PlainTableOptions plain_table_options;
|
|
|
|
plain_table_options.user_key_len = 20;
|
|
|
|
plain_table_options.bloom_bits_per_key = 8;
|
|
|
|
plain_table_options.hash_table_ratio = 0;
|
|
|
|
PlainTableFactory factory(plain_table_options);
|
|
|
|
|
2020-03-29 22:57:02 +00:00
|
|
|
FileChecksumGenCrc32cFactory* file_checksum_gen_factory =
|
|
|
|
new FileChecksumGenCrc32cFactory();
|
2020-02-10 23:42:46 +00:00
|
|
|
Options options;
|
2020-03-29 22:57:02 +00:00
|
|
|
options.file_checksum_gen_factory.reset(file_checksum_gen_factory);
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2020-02-10 23:42:46 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2020-02-10 23:42:46 +00:00
|
|
|
std::string column_family_name;
|
|
|
|
int unknown_level = -1;
|
2020-03-29 22:57:02 +00:00
|
|
|
|
|
|
|
FileChecksumGenContext gen_context;
|
|
|
|
gen_context.file_name = "db/tmp";
|
2020-05-21 15:10:43 +00:00
|
|
|
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen1 =
|
2020-03-29 22:57:02 +00:00
|
|
|
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
|
|
|
|
gen_context);
|
2020-02-10 23:42:46 +00:00
|
|
|
FileChecksumTestHelper f(true);
|
2022-06-22 22:45:21 +00:00
|
|
|
f.CreateWritableFile();
|
2020-05-21 15:10:43 +00:00
|
|
|
f.SetFileChecksumGenerator(checksum_crc32c_gen1.release());
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2020-02-10 23:42:46 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
|
2024-02-02 22:14:43 +00:00
|
|
|
&internal_tbl_prop_coll_factories, kNoCompression,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 20:49:24 +00:00
|
|
|
CompressionOptions(), kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
column_family_name, unknown_level,
|
|
|
|
kUnknownNewestKeyTime),
|
2020-02-10 23:42:46 +00:00
|
|
|
f.GetFileWriter()));
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(f.ResetTableBuilder(std::move(builder)));
|
2020-02-10 23:42:46 +00:00
|
|
|
f.AddKVtoKVMap(1000);
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(f.WriteKVAndFlushTable());
|
2020-02-10 23:42:46 +00:00
|
|
|
ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c");
|
2020-03-29 22:57:02 +00:00
|
|
|
|
2020-05-21 15:10:43 +00:00
|
|
|
std::unique_ptr<FileChecksumGenerator> checksum_crc32c_gen2 =
|
2020-03-29 22:57:02 +00:00
|
|
|
options.file_checksum_gen_factory->CreateFileChecksumGenerator(
|
|
|
|
gen_context);
|
2020-02-10 23:42:46 +00:00
|
|
|
std::string checksum;
|
2020-05-21 15:10:43 +00:00
|
|
|
ASSERT_OK(f.CalculateFileChecksum(checksum_crc32c_gen2.get(), &checksum));
|
2020-02-10 23:42:46 +00:00
|
|
|
EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str());
|
|
|
|
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2011-03-18 22:37:00 +00:00
|
|
|
c.Add("k01", "hello");
|
|
|
|
c.Add("k02", "hello2");
|
|
|
|
c.Add("k03", std::string(10000, 'x'));
|
|
|
|
c.Add("k04", std::string(200000, 'x'));
|
|
|
|
c.Add("k05", std::string(300000, 'x'));
|
|
|
|
c.Add("k06", "hello3");
|
|
|
|
c.Add("k07", std::string(100000, 'x'));
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2013-11-20 06:00:48 +00:00
|
|
|
Options options;
|
2020-10-19 18:37:05 +00:00
|
|
|
options.db_host_id = "";
|
2014-01-27 21:53:22 +00:00
|
|
|
test::PlainInternalKeyComparator internal_comparator(options.comparator);
|
2011-03-18 22:37:00 +00:00
|
|
|
options.compression = kNoCompression;
|
2014-08-25 21:22:05 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 1024;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, internal_comparator,
|
2014-09-04 23:18:36 +00:00
|
|
|
&keys, &kvmap);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2022-10-25 18:50:38 +00:00
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000));
|
2016-08-19 22:10:31 +00:00
|
|
|
// k04 and k05 will be in two consecutive blocks, the index is
|
|
|
|
// an arbitrary slice between k04 and k05, either before or after k04a
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000));
|
2022-10-25 18:50:38 +00:00
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000));
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2014-01-24 19:09:04 +00:00
|
|
|
static void DoCompressionTest(CompressionType comp) {
|
2023-04-17 21:17:18 +00:00
|
|
|
SCOPED_TRACE("CompressionType = " + CompressionTypeToString(comp));
|
2011-03-18 22:37:00 +00:00
|
|
|
Random rnd(301);
|
2016-08-19 22:10:31 +00:00
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
2011-03-18 22:37:00 +00:00
|
|
|
std::string tmp;
|
|
|
|
c.Add("k01", "hello");
|
|
|
|
c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
|
|
|
|
c.Add("k03", "hello3");
|
|
|
|
c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
|
|
|
|
std::vector<std::string> keys;
|
2015-09-02 20:58:22 +00:00
|
|
|
stl_wrappers::KVMap kvmap;
|
2013-11-20 06:00:48 +00:00
|
|
|
Options options;
|
2014-01-27 21:53:22 +00:00
|
|
|
test::PlainInternalKeyComparator ikc(options.comparator);
|
2012-06-28 06:41:33 +00:00
|
|
|
options.compression = comp;
|
2024-02-15 19:23:48 +00:00
|
|
|
options.db_host_id = "";
|
2014-08-25 21:22:05 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 1024;
|
2024-02-15 19:23:48 +00:00
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
For ApproximateSizes, pro-rate table metadata size over data blocks (#6784)
Summary:
The implementation of GetApproximateSizes was inconsistent in
its treatment of the size of non-data blocks of SST files, sometimes
including and sometimes now. This was at its worst with large portion
of table file used by filters and querying a small range that crossed
a table boundary: the size estimate would include large filter size.
It's conceivable that someone might want only to know the size in terms
of data blocks, but I believe that's unlikely enough to ignore for now.
Similarly, there's no evidence the internal function AppoximateOffsetOf
is used for anything other than a one-sided ApproximateSize, so I intend
to refactor to remove redundancy in a follow-up commit.
So to fix this, GetApproximateSizes (and implementation details
ApproximateSize and ApproximateOffsetOf) now consistently include in
their returned sizes a portion of table file metadata (incl filters
and indexes) based on the size portion of the data blocks in range. In
other words, if a key range covers data blocks that are X% by size of all
the table's data blocks, returned approximate size is X% of the total
file size. It would technically be more accurate to attribute metadata
based on number of keys, but that's not computationally efficient with
data available and rarely a meaningful difference.
Also includes miscellaneous comment improvements / clarifications.
Also included is a new approximatesizerandom benchmark for db_bench.
No significant performance difference seen with this change, whether ~700 ops/sec with cache_index_and_filter_blocks and small cache or ~150k ops/sec without cache_index_and_filter_blocks.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6784
Test Plan:
Test added to DBTest.ApproximateSizesFilesWithErrorMargin.
Old code running new test...
[ RUN ] DBTest.ApproximateSizesFilesWithErrorMargin
db/db_test.cc:1562: Failure
Expected: (size) <= (11 * 100), actual: 9478 vs 1100
Other tests updated to reflect consistent accounting of metadata.
Reviewed By: siying
Differential Revision: D21334706
Pulled By: pdillinger
fbshipit-source-id: 6f86870e45213334fedbe9c73b4ebb1d8d611185
2020-06-02 19:27:59 +00:00
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
|
2024-11-01 17:08:35 +00:00
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3555));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3555));
|
|
|
|
ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7110));
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) {
|
2014-02-11 01:02:02 +00:00
|
|
|
std::vector<CompressionType> compression_state;
|
2015-04-06 19:50:44 +00:00
|
|
|
if (!Snappy_Supported()) {
|
2012-06-28 06:41:33 +00:00
|
|
|
fprintf(stderr, "skipping snappy compression tests\n");
|
|
|
|
} else {
|
2014-02-11 01:02:02 +00:00
|
|
|
compression_state.push_back(kSnappyCompression);
|
2012-06-28 06:41:33 +00:00
|
|
|
}
|
|
|
|
|
2015-04-06 19:50:44 +00:00
|
|
|
if (!Zlib_Supported()) {
|
2012-06-28 06:41:33 +00:00
|
|
|
fprintf(stderr, "skipping zlib compression tests\n");
|
|
|
|
} else {
|
2014-02-11 01:02:02 +00:00
|
|
|
compression_state.push_back(kZlibCompression);
|
2012-06-28 06:41:33 +00:00
|
|
|
}
|
|
|
|
|
2014-02-11 01:02:02 +00:00
|
|
|
// TODO(kailiu) DoCompressionTest() doesn't work with BZip2.
|
|
|
|
/*
|
2015-04-06 19:50:44 +00:00
|
|
|
if (!BZip2_Supported()) {
|
2014-02-08 02:12:30 +00:00
|
|
|
fprintf(stderr, "skipping bzip2 compression tests\n");
|
|
|
|
} else {
|
2014-02-11 01:02:02 +00:00
|
|
|
compression_state.push_back(kBZip2Compression);
|
2014-02-08 02:12:30 +00:00
|
|
|
}
|
2014-02-11 01:02:02 +00:00
|
|
|
*/
|
2014-02-08 02:12:30 +00:00
|
|
|
|
2015-04-06 19:50:44 +00:00
|
|
|
if (!LZ4_Supported()) {
|
|
|
|
fprintf(stderr, "skipping lz4 and lz4hc compression tests\n");
|
2014-02-08 02:12:30 +00:00
|
|
|
} else {
|
2014-02-11 01:02:02 +00:00
|
|
|
compression_state.push_back(kLZ4Compression);
|
|
|
|
compression_state.push_back(kLZ4HCCompression);
|
2012-06-28 06:41:33 +00:00
|
|
|
}
|
|
|
|
|
2016-04-20 05:54:24 +00:00
|
|
|
if (!XPRESS_Supported()) {
|
|
|
|
fprintf(stderr, "skipping xpress and xpress compression tests\n");
|
2022-10-25 18:50:38 +00:00
|
|
|
} else {
|
2016-04-20 05:54:24 +00:00
|
|
|
compression_state.push_back(kXpressCompression);
|
|
|
|
}
|
|
|
|
|
2014-02-11 01:02:02 +00:00
|
|
|
for (auto state : compression_state) {
|
|
|
|
DoCompressionTest(state);
|
2012-06-28 06:41:33 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-07-24 00:38:49 +00:00
|
|
|
TEST_F(GeneralTableTest, ApproximateKeyAnchors) {
|
|
|
|
Random rnd(301);
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
std::string tmp;
|
|
|
|
for (int i = 1000; i < 9000; i++) {
|
|
|
|
c.Add(std::to_string(i), rnd.RandomString(2000));
|
|
|
|
}
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
InternalKeyComparator ikc(options.comparator);
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 4096;
|
|
|
|
const ImmutableOptions ioptions(options);
|
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
|
|
|
|
|
|
|
|
std::vector<TableReader::Anchor> anchors;
|
|
|
|
ASSERT_OK(c.GetTableReader()->ApproximateKeyAnchors(ReadOptions(), anchors));
|
2023-12-06 21:48:15 +00:00
|
|
|
// The target is 128 anchors. But in reality it can be slightly more or
|
|
|
|
// fewer.
|
2022-07-24 00:38:49 +00:00
|
|
|
ASSERT_GT(anchors.size(), 120);
|
|
|
|
ASSERT_LT(anchors.size(), 140);
|
|
|
|
|
|
|
|
// We have around 8000 keys. With 128 anchors, in average 62.5 keys per
|
|
|
|
// anchor. Here we take a rough range and estimate the distance between
|
|
|
|
// anchors is between 50 and 100.
|
|
|
|
// Total data size is about 18,000,000, so each anchor range is about
|
|
|
|
// 140,625. We also take a rough range.
|
|
|
|
int prev_num = 1000;
|
|
|
|
// Non-last anchor
|
|
|
|
for (size_t i = 0; i + 1 < anchors.size(); i++) {
|
|
|
|
auto& anchor = anchors[i];
|
|
|
|
ASSERT_GT(anchor.range_size, 100000);
|
|
|
|
ASSERT_LT(anchor.range_size, 200000);
|
|
|
|
|
|
|
|
// Key might be shortened, so fill 0 in the end if it is the case.
|
|
|
|
std::string key_cpy = anchor.user_key;
|
|
|
|
key_cpy.append(4 - key_cpy.size(), '0');
|
|
|
|
int num = std::stoi(key_cpy);
|
|
|
|
ASSERT_GT(num - prev_num, 50);
|
|
|
|
ASSERT_LT(num - prev_num, 100);
|
|
|
|
prev_num = num;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ("8999", anchors.back().user_key);
|
|
|
|
ASSERT_LT(anchors.back().range_size, 200000);
|
|
|
|
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
2021-07-07 18:13:09 +00:00
|
|
|
#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
2020-06-13 00:29:22 +00:00
|
|
|
TEST_P(ParameterizedHarnessTest, RandomizedHarnessTest) {
|
|
|
|
Random rnd(test::RandomSeed() + 5);
|
|
|
|
for (int num_entries = 0; num_entries < 2000;
|
|
|
|
num_entries += (num_entries < 50 ? 1 : 200)) {
|
|
|
|
for (int e = 0; e < num_entries; e++) {
|
|
|
|
Add(test::RandomKey(&rnd, rnd.Skewed(4)),
|
2020-07-09 21:33:42 +00:00
|
|
|
rnd.RandomString(rnd.Skewed(5)));
|
2020-06-13 00:29:22 +00:00
|
|
|
}
|
|
|
|
Test(&rnd);
|
|
|
|
}
|
2013-11-10 09:17:32 +00:00
|
|
|
}
|
|
|
|
|
2020-06-13 00:29:22 +00:00
|
|
|
TEST_F(DBHarnessTest, RandomizedLongDB) {
|
2013-11-10 09:17:32 +00:00
|
|
|
Random rnd(test::RandomSeed());
|
|
|
|
int num_entries = 100000;
|
|
|
|
for (int e = 0; e < num_entries; e++) {
|
|
|
|
std::string v;
|
2020-07-09 21:33:42 +00:00
|
|
|
Add(test::RandomKey(&rnd, rnd.Skewed(4)), rnd.RandomString(rnd.Skewed(5)));
|
2013-11-10 09:17:32 +00:00
|
|
|
}
|
|
|
|
Test(&rnd);
|
|
|
|
|
|
|
|
// We must have created enough data to force merging
|
|
|
|
int files = 0;
|
|
|
|
for (int level = 0; level < db()->NumberLevels(); level++) {
|
|
|
|
std::string value;
|
|
|
|
char name[100];
|
|
|
|
snprintf(name, sizeof(name), "rocksdb.num-files-at-level%d", level);
|
|
|
|
ASSERT_TRUE(db()->GetProperty(name, &value));
|
|
|
|
files += atoi(value.c_str());
|
|
|
|
}
|
|
|
|
ASSERT_GT(files, 0);
|
|
|
|
}
|
2021-07-07 18:13:09 +00:00
|
|
|
#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
|
2013-11-10 09:17:32 +00:00
|
|
|
|
2020-10-14 19:43:14 +00:00
|
|
|
class MemTableTest : public testing::Test {
|
|
|
|
public:
|
|
|
|
MemTableTest() {
|
|
|
|
InternalKeyComparator cmp(BytewiseComparator());
|
|
|
|
auto table_factory = std::make_shared<SkipListFactory>();
|
|
|
|
options_.memtable_factory = table_factory;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options_);
|
2020-10-14 19:43:14 +00:00
|
|
|
wb_ = new WriteBufferManager(options_.db_write_buffer_size);
|
|
|
|
memtable_ = new MemTable(cmp, ioptions, MutableCFOptions(options_), wb_,
|
|
|
|
kMaxSequenceNumber, 0 /* column_family_id */);
|
|
|
|
memtable_->Ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
~MemTableTest() {
|
|
|
|
delete memtable_->Unref();
|
|
|
|
delete wb_;
|
|
|
|
}
|
|
|
|
|
|
|
|
MemTable* GetMemTable() { return memtable_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
MemTable* memtable_;
|
|
|
|
Options options_;
|
|
|
|
WriteBufferManager* wb_;
|
|
|
|
};
|
2013-11-10 09:17:32 +00:00
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
TEST_F(MemTableTest, Simple) {
|
2013-11-10 09:17:32 +00:00
|
|
|
WriteBatch batch;
|
|
|
|
WriteBatchInternal::SetSequence(&batch, 100);
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(batch.Put(std::string("k1"), std::string("v1")));
|
|
|
|
ASSERT_OK(batch.Put(std::string("k2"), std::string("v2")));
|
|
|
|
ASSERT_OK(batch.Put(std::string("k3"), std::string("v3")));
|
|
|
|
ASSERT_OK(batch.Put(std::string("largekey"), std::string("vlarge")));
|
|
|
|
ASSERT_OK(batch.DeleteRange(std::string("chi"), std::string("xigua")));
|
|
|
|
ASSERT_OK(batch.DeleteRange(std::string("begin"), std::string("end")));
|
2020-10-14 19:43:14 +00:00
|
|
|
ColumnFamilyMemTablesDefault cf_mems_default(GetMemTable());
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
2015-08-14 23:59:07 +00:00
|
|
|
ASSERT_TRUE(
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr)
|
|
|
|
.ok());
|
2013-11-10 09:17:32 +00:00
|
|
|
|
2016-09-12 21:14:40 +00:00
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
Arena arena;
|
2024-03-22 20:40:42 +00:00
|
|
|
ScopedArenaPtr<InternalIterator> arena_iter_guard;
|
2016-11-19 22:14:35 +00:00
|
|
|
std::unique_ptr<InternalIterator> iter_guard;
|
|
|
|
InternalIterator* iter;
|
|
|
|
if (i == 0) {
|
Steps toward deprecating implicit prefix seek, related fixes (#13026)
Summary:
With some new use cases onboarding to prefix extractors/seek/filters, one of the risks is existing iterator code, e.g. for maintenance tasks, being unintentionally subject to prefix seek semantics. This is a longstanding known design flaw with prefix seek, and `prefix_same_as_start` and `auto_prefix_mode` were steps in the direction of making that obsolete. However, we can't just immediately set `total_order_seek` to true by default, because that would impact so much code instantly.
Here we add a new DB option, `prefix_seek_opt_in_only` that basically allows users to transition to the future behavior when they are ready. When set to true, all iterators will be treated as if `total_order_seek=true` and then the only ways to get prefix seek semantics are with `prefix_same_as_start` or `auto_prefix_mode`.
Related fixes / changes:
* Make sure that `prefix_same_as_start` and `auto_prefix_mode` are compatible with (or override) `total_order_seek` (depending on your interpretation).
* Fix a bug in which a new iterator after dynamically changing the prefix extractor might mix different prefix semantics between memtable and SSTs. Both should use the latest extractor semantics, which means iterators ignoring memtable prefix filters with an old extractor. And that means passing the latest prefix extractor to new memtable iterators that might use prefix seek. (Without the fix, the test added for this fails in many ways.)
Suggested follow-up:
* Investigate a FIXME where a MergeIteratorBuilder is created in db_impl.cc. No unit test detects a change in value that should impact correctness.
* Make memtable prefix bloom compatible with `auto_prefix_mode`, which might require involving the memtablereps because we don't know at iterator creation time (only seek time) whether an auto_prefix_mode seek will be a prefix seek.
* Add `prefix_same_as_start` testing to db_stress
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13026
Test Plan:
tests updated, added. Add combination of `total_order_seek=true` and `auto_prefix_mode=true` to stress test. Ran `make blackbox_crash_test` for a long while.
Manually ran tests with `prefix_seek_opt_in_only=true` as default, looking for unexpected issues. I inspected most of the results and migrated many tests to be ready for such a change (but not all).
Reviewed By: ltamasi
Differential Revision: D63147378
Pulled By: pdillinger
fbshipit-source-id: 1f4477b730683d43b4be7e933338583702d3c25e
2024-09-20 22:54:19 +00:00
|
|
|
iter = GetMemTable()->NewIterator(ReadOptions(),
|
|
|
|
/*seqno_to_time_mapping=*/nullptr,
|
|
|
|
&arena, /*prefix_extractor=*/nullptr);
|
2024-03-22 20:40:42 +00:00
|
|
|
arena_iter_guard.reset(iter);
|
2016-11-19 22:14:35 +00:00
|
|
|
} else {
|
2020-10-14 19:43:14 +00:00
|
|
|
iter = GetMemTable()->NewRangeTombstoneIterator(
|
2022-08-05 19:02:33 +00:00
|
|
|
ReadOptions(), kMaxSequenceNumber /* read_seq */,
|
|
|
|
false /* immutable_memtable */);
|
2016-11-19 22:14:35 +00:00
|
|
|
iter_guard.reset(iter);
|
|
|
|
}
|
2016-11-21 20:07:09 +00:00
|
|
|
if (iter == nullptr) {
|
|
|
|
continue;
|
|
|
|
}
|
2016-09-12 21:14:40 +00:00
|
|
|
iter->SeekToFirst();
|
|
|
|
while (iter->Valid()) {
|
|
|
|
fprintf(stderr, "key: '%s' -> '%s'\n", iter->key().ToString().c_str(),
|
|
|
|
iter->value().ToString().c_str());
|
|
|
|
iter->Next();
|
|
|
|
}
|
2013-11-10 09:17:32 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-12-06 00:51:26 +00:00
|
|
|
// Test the empty key
|
2020-06-13 00:29:22 +00:00
|
|
|
TEST_P(ParameterizedHarnessTest, SimpleEmptyKey) {
|
|
|
|
Random rnd(test::RandomSeed() + 1);
|
|
|
|
Add("", "v");
|
|
|
|
Test(&rnd);
|
2013-12-06 00:51:26 +00:00
|
|
|
}
|
|
|
|
|
2020-06-13 00:29:22 +00:00
|
|
|
TEST_P(ParameterizedHarnessTest, SimpleSingle) {
|
|
|
|
Random rnd(test::RandomSeed() + 2);
|
|
|
|
Add("abc", "v");
|
|
|
|
Test(&rnd);
|
2013-12-06 00:51:26 +00:00
|
|
|
}
|
|
|
|
|
2020-06-13 00:29:22 +00:00
|
|
|
TEST_P(ParameterizedHarnessTest, SimpleMulti) {
|
|
|
|
Random rnd(test::RandomSeed() + 3);
|
|
|
|
Add("abc", "v");
|
|
|
|
Add("abcd", "v");
|
|
|
|
Add("ac", "v2");
|
|
|
|
Test(&rnd);
|
2013-12-06 00:51:26 +00:00
|
|
|
}
|
|
|
|
|
2020-06-13 00:29:22 +00:00
|
|
|
TEST_P(ParameterizedHarnessTest, SimpleSpecialKey) {
|
|
|
|
Random rnd(test::RandomSeed() + 4);
|
|
|
|
Add("\xff\xff", "v3");
|
|
|
|
Test(&rnd);
|
2013-12-06 00:51:26 +00:00
|
|
|
}
|
2013-11-10 09:17:32 +00:00
|
|
|
|
2020-06-13 00:29:22 +00:00
|
|
|
TEST(TableTest, FooterTests) {
|
2021-12-10 16:12:09 +00:00
|
|
|
Random* r = Random::GetTLSInstance();
|
|
|
|
uint64_t data_size = (uint64_t{1} << r->Uniform(40)) + r->Uniform(100);
|
|
|
|
uint64_t index_size = r->Uniform(1000000000);
|
|
|
|
uint64_t metaindex_size = r->Uniform(1000000);
|
|
|
|
// 5 == block trailer size
|
|
|
|
BlockHandle index(data_size + 5, index_size);
|
|
|
|
BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size);
|
|
|
|
uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5;
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
uint32_t base_context_checksum = 123456789;
|
2014-05-01 18:09:32 +00:00
|
|
|
{
|
2021-12-14 01:42:05 +00:00
|
|
|
// legacy block based
|
|
|
|
FooterBuilder footer;
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
ASSERT_OK(footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0,
|
|
|
|
footer_offset, kCRC32c, meta_index, index));
|
2014-05-01 18:09:32 +00:00
|
|
|
Footer decoded_footer;
|
2021-12-14 01:42:05 +00:00
|
|
|
ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
|
2014-05-01 18:09:32 +00:00
|
|
|
ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
|
2014-05-01 18:09:32 +00:00
|
|
|
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
|
|
|
|
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
|
|
|
|
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
|
|
|
|
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.format_version(), 0U);
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
ASSERT_EQ(decoded_footer.base_context_checksum(), 0U);
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
|
2021-12-14 01:42:05 +00:00
|
|
|
// Ensure serialized with legacy magic
|
|
|
|
ASSERT_EQ(
|
|
|
|
DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
|
|
|
|
kLegacyBlockBasedTableMagicNumber);
|
2014-05-01 18:09:32 +00:00
|
|
|
}
|
2021-12-10 16:12:09 +00:00
|
|
|
// block based, various checksums, various versions
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
for (auto t : GetSupportedChecksums()) {
|
2021-12-10 16:12:09 +00:00
|
|
|
for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) {
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
uint32_t maybe_bcc =
|
|
|
|
FormatVersionUsesContextChecksum(fv) ? base_context_checksum : 0U;
|
2021-12-14 01:42:05 +00:00
|
|
|
FooterBuilder footer;
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
ASSERT_OK(footer.Build(kBlockBasedTableMagicNumber, fv, footer_offset, t,
|
|
|
|
meta_index, index, maybe_bcc));
|
2021-12-10 16:12:09 +00:00
|
|
|
Footer decoded_footer;
|
2021-12-14 01:42:05 +00:00
|
|
|
ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.table_magic_number(),
|
|
|
|
kBlockBasedTableMagicNumber);
|
|
|
|
ASSERT_EQ(decoded_footer.checksum_type(), t);
|
|
|
|
ASSERT_EQ(decoded_footer.metaindex_handle().offset(),
|
|
|
|
meta_index.offset());
|
|
|
|
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
if (FormatVersionUsesIndexHandleInFooter(fv)) {
|
|
|
|
ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
|
|
|
|
ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
|
|
|
|
}
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.format_version(), fv);
|
|
|
|
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
|
|
|
|
if (FormatVersionUsesContextChecksum(fv)) {
|
|
|
|
ASSERT_EQ(decoded_footer.base_context_checksum(),
|
|
|
|
base_context_checksum);
|
|
|
|
|
|
|
|
// Bad offset should fail footer checksum
|
|
|
|
decoded_footer = Footer();
|
|
|
|
ASSERT_NOK(
|
|
|
|
decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset - 1));
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(decoded_footer.base_context_checksum(), 0U);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Too big metaindex size should also fail encoding only in new footer
|
|
|
|
uint64_t big_metaindex_size = 0x100000007U;
|
|
|
|
uint64_t big_footer_offset =
|
|
|
|
data_size + big_metaindex_size + index_size + 3 * 5;
|
|
|
|
BlockHandle big_metaindex =
|
|
|
|
BlockHandle(data_size + index_size + 2 * 5, big_metaindex_size);
|
|
|
|
ASSERT_NE(footer
|
|
|
|
.Build(kBlockBasedTableMagicNumber, fv, big_footer_offset,
|
|
|
|
t, big_metaindex, index, maybe_bcc)
|
|
|
|
.ok(),
|
|
|
|
FormatVersionUsesContextChecksum(fv));
|
2021-12-10 16:12:09 +00:00
|
|
|
}
|
2018-11-01 22:39:40 +00:00
|
|
|
}
|
2023-01-27 21:14:19 +00:00
|
|
|
|
2014-05-01 18:09:32 +00:00
|
|
|
{
|
2021-12-14 01:42:05 +00:00
|
|
|
// legacy plain table
|
|
|
|
FooterBuilder footer;
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 0,
|
|
|
|
footer_offset, kNoChecksum, meta_index));
|
2014-05-01 18:09:32 +00:00
|
|
|
Footer decoded_footer;
|
2021-12-14 01:42:05 +00:00
|
|
|
ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
|
2014-05-01 18:09:32 +00:00
|
|
|
ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
|
2014-05-01 18:09:32 +00:00
|
|
|
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
|
|
|
|
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
|
2021-12-14 01:42:05 +00:00
|
|
|
ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
|
|
|
|
ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.format_version(), 0U);
|
|
|
|
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
|
2021-12-14 01:42:05 +00:00
|
|
|
// Ensure serialized with legacy magic
|
|
|
|
ASSERT_EQ(
|
|
|
|
DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
|
|
|
|
kLegacyPlainTableMagicNumber);
|
2014-05-01 18:09:32 +00:00
|
|
|
}
|
|
|
|
{
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
// xxhash plain table (not currently used)
|
2021-12-14 01:42:05 +00:00
|
|
|
FooterBuilder footer;
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 1,
|
|
|
|
footer_offset, kxxHash, meta_index));
|
2014-05-01 18:09:32 +00:00
|
|
|
Footer decoded_footer;
|
2021-12-14 01:42:05 +00:00
|
|
|
ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
|
2014-05-01 18:09:32 +00:00
|
|
|
ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.checksum_type(), kxxHash);
|
2014-05-01 18:09:32 +00:00
|
|
|
ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
|
|
|
|
ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
|
2021-12-14 01:42:05 +00:00
|
|
|
ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
|
|
|
|
ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(decoded_footer.format_version(), 1U);
|
|
|
|
ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
|
2015-01-13 22:33:04 +00:00
|
|
|
}
|
2014-05-01 18:09:32 +00:00
|
|
|
}
|
|
|
|
|
2016-02-05 18:22:37 +00:00
|
|
|
class IndexBlockRestartIntervalTest
|
2018-06-05 02:59:44 +00:00
|
|
|
: public TableTest,
|
2018-08-09 23:49:45 +00:00
|
|
|
public ::testing::WithParamInterface<std::pair<int, bool>> {
|
2016-02-05 18:22:37 +00:00
|
|
|
public:
|
2018-08-09 23:49:45 +00:00
|
|
|
static std::vector<std::pair<int, bool>> GetRestartValues() {
|
|
|
|
return {{-1, false}, {0, false}, {1, false}, {8, false},
|
|
|
|
{16, false}, {32, false}, {-1, true}, {0, true},
|
|
|
|
{1, true}, {8, true}, {16, true}, {32, true}};
|
|
|
|
}
|
2016-02-05 18:22:37 +00:00
|
|
|
};
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
|
2016-02-05 18:22:37 +00:00
|
|
|
IndexBlockRestartIntervalTest, IndexBlockRestartIntervalTest,
|
|
|
|
::testing::ValuesIn(IndexBlockRestartIntervalTest::GetRestartValues()));
|
|
|
|
|
|
|
|
TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) {
|
|
|
|
const int kKeysInTable = 10000;
|
|
|
|
const int kKeySize = 100;
|
|
|
|
const int kValSize = 500;
|
|
|
|
|
2018-08-09 23:49:45 +00:00
|
|
|
const int index_block_restart_interval = std::get<0>(GetParam());
|
|
|
|
const bool value_delta_encoding = std::get<1>(GetParam());
|
2016-02-05 18:22:37 +00:00
|
|
|
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 64; // small block size to get big index block
|
|
|
|
table_options.index_block_restart_interval = index_block_restart_interval;
|
2018-08-09 23:49:45 +00:00
|
|
|
if (value_delta_encoding) {
|
|
|
|
table_options.format_version = 4;
|
2021-03-09 20:41:15 +00:00
|
|
|
} else {
|
|
|
|
table_options.format_version = 3;
|
2018-08-09 23:49:45 +00:00
|
|
|
}
|
2016-02-05 18:22:37 +00:00
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
TableConstructor c(BytewiseComparator());
|
|
|
|
static Random rnd(301);
|
|
|
|
for (int i = 0; i < kKeysInTable; i++) {
|
2020-07-09 21:33:42 +00:00
|
|
|
InternalKey k(rnd.RandomString(kKeySize), 0, kTypeValue);
|
|
|
|
c.Add(k.Encode().ToString(), rnd.RandomString(kValSize));
|
2016-02-05 18:22:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
std::unique_ptr<InternalKeyComparator> comparator(
|
|
|
|
new InternalKeyComparator(BytewiseComparator()));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, *comparator, &keys,
|
|
|
|
&kvmap);
|
2016-02-05 18:22:37 +00:00
|
|
|
auto reader = c.GetTableReader();
|
|
|
|
|
2020-08-03 22:21:56 +00:00
|
|
|
ReadOptions read_options;
|
2019-06-20 21:28:22 +00:00
|
|
|
std::unique_ptr<InternalIterator> db_iter(reader->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
2019-06-20 21:28:22 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
2016-02-05 18:22:37 +00:00
|
|
|
|
|
|
|
// Test point lookup
|
|
|
|
for (auto& kv : kvmap) {
|
|
|
|
db_iter->Seek(kv.first);
|
|
|
|
|
|
|
|
ASSERT_TRUE(db_iter->Valid());
|
|
|
|
ASSERT_OK(db_iter->status());
|
|
|
|
ASSERT_EQ(db_iter->key(), kv.first);
|
|
|
|
ASSERT_EQ(db_iter->value(), kv.second);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Test iterating
|
|
|
|
auto kv_iter = kvmap.begin();
|
|
|
|
for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
|
|
|
|
ASSERT_EQ(db_iter->key(), kv_iter->first);
|
|
|
|
ASSERT_EQ(db_iter->value(), kv_iter->second);
|
|
|
|
kv_iter++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(kv_iter, kvmap.end());
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
c.ResetTableReader();
|
2016-02-05 18:22:37 +00:00
|
|
|
}
|
|
|
|
|
2016-02-23 00:33:26 +00:00
|
|
|
class PrefixTest : public testing::Test {
|
|
|
|
public:
|
|
|
|
PrefixTest() : testing::Test() {}
|
2023-12-01 19:15:17 +00:00
|
|
|
~PrefixTest() override = default;
|
2016-02-23 00:33:26 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
// A simple PrefixExtractor that only works for test PrefixAndWholeKeyTest
|
2020-02-20 20:07:53 +00:00
|
|
|
class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform {
|
2016-02-23 00:33:26 +00:00
|
|
|
public:
|
2023-12-01 19:15:17 +00:00
|
|
|
~TestPrefixExtractor() override = default;
|
|
|
|
;
|
2016-02-23 00:33:26 +00:00
|
|
|
const char* Name() const override { return "TestPrefixExtractor"; }
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::Slice Transform(
|
|
|
|
const ROCKSDB_NAMESPACE::Slice& src) const override {
|
2016-02-23 00:33:26 +00:00
|
|
|
assert(IsValid(src));
|
2020-02-20 20:07:53 +00:00
|
|
|
return ROCKSDB_NAMESPACE::Slice(src.data(), 3);
|
2016-02-23 00:33:26 +00:00
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override {
|
Fix a bug for SeekForPrev with partitioned filter and prefix (#8137)
Summary:
According to https://github.com/facebook/rocksdb/issues/5907, each filter partition "should include the bloom of the prefix of the last
key in the previous partition" so that SeekForPrev() in prefix mode can return correct result.
The prefix of the last key in the previous partition does not necessarily have the same prefix
as the first key in the current partition. Regardless of the first key in current partition, the
prefix of the last key in the previous partition should be added. The existing code, however,
does not follow this. Furthermore, there is another issue: when finishing current filter partition,
`FullFilterBlockBuilder::AddPrefix()` is called for the first key in next filter partition, which effectively
overwrites `last_prefix_str_` prematurely. Consequently, when the filter block builder proceeds
to the next partition, `last_prefix_str_` will be the prefix of its first key, leaving no way of adding
the bloom of the prefix of the last key of the previous partition.
Prefix extractor is FixedLength.2.
```
[ filter part 1 ] [ filter part 2 ]
abc d
```
When SeekForPrev("abcd"), checking the filter partition will land on filter part 2 because "abcd" > "abc"
but smaller than "d".
If the filter in filter part 2 happens to return false for the test for "ab", then SeekForPrev("abcd") will build
incorrect iterator tree in non-total-order mode.
Also fix a unit test which starts to fail following this PR. `InDomain` should not fail due to assertion
error when checking on an arbitrary key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8137
Test Plan:
```
make check
```
Without this fix, the following command will fail pretty soon.
```
./db_stress --acquire_snapshot_one_in=10000 --avoid_flush_during_recovery=0 \
--avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=0 \
--batch_protection_bytes_per_key=0 --block_size=16384 --bloom_bits=17 \
--bottommost_compression_type=disable --cache_index_and_filter_blocks=1 --cache_size=1048576 \
--checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 \
--compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_ttl=0 \
--compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 \
--compression_parallel_threads=1 --compression_type=zstd --compression_zstd_max_train_bytes=0 \
--continuous_verification_interval=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_whitebox \
--db_write_buffer_size=8388608 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --enable_blob_files=0 \
--enable_compaction_filter=0 --enable_pipelined_write=1 --file_checksum_impl=big --flush_one_in=1000000 \
--format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 \
--get_sorted_wal_files_one_in=0 --index_block_restart_interval=4 --index_type=2 --ingest_external_file_one_in=0 \
--iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=True \
--log2_keys_per_lock=10 --long_running_snapshots=1 --mark_for_compaction_one_file_in=0 \
--max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000000 --max_key_len=3 \
--max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 \
--max_write_buffer_size_to_maintain=8388608 --memtablerep=skip_list --mmap_read=1 --mock_direct_io=False \
--nooverwritepercent=0 --open_files=500000 --ops_per_thread=20000000 --optimize_filters_for_memory=0 --paranoid_file_checks=1 --partition_filters=1 --partition_pinning=0 --pause_background_one_in=1000000 \
--periodic_compaction_seconds=0 --prefixpercent=5 --progress_reports=0 --read_fault_one_in=0 --read_only=0 \
--readpercent=45 --recycle_log_file_num=0 --reopen=20 --secondary_catch_up_one_in=0 \
--snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 \
--sst_file_manager_bytes_per_truncate=0 --subcompactions=2 --sync=0 --sync_fault_injection=False \
--target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=0 \
--top_level_index_pinning=0 --unpartitioned_pinning=1 --use_blob_db=0 --use_block_based_filter=0 \
--use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_merge=0 \
--use_multiget=0 --use_ribbon_filter=0 --use_txn=0 --user_timestamp_size=8 --verify_checksum=1 \
--verify_checksum_one_in=1000000 --verify_db_one_in=100000 --write_buffer_size=4194304 \
--write_dbid_to_manifest=1 --writepercent=35
```
Reviewed By: pdillinger
Differential Revision: D27553054
Pulled By: riversand963
fbshipit-source-id: 60e391e4a2d8d98a9a3172ec5d6176b90ec3de98
2021-04-06 19:12:57 +00:00
|
|
|
return IsValid(src);
|
2016-02-23 00:33:26 +00:00
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override {
|
|
|
|
return true;
|
|
|
|
}
|
2016-02-23 00:33:26 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
bool IsValid(const ROCKSDB_NAMESPACE::Slice& src) const {
|
2016-02-23 00:33:26 +00:00
|
|
|
if (src.size() != 4) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (src[0] != '[') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (src[1] < '0' || src[1] > '9') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (src[2] != ']') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (src[3] < '0' || src[3] > '9') {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::Options options;
|
|
|
|
options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleUniversal;
|
2016-02-23 00:33:26 +00:00
|
|
|
options.num_levels = 20;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.optimize_filters_for_hits = false;
|
|
|
|
options.target_file_size_base = 268435456;
|
|
|
|
options.prefix_extractor = std::make_shared<TestPrefixExtractor>();
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
|
2016-02-23 00:33:26 +00:00
|
|
|
bbto.block_size = 262144;
|
|
|
|
bbto.whole_key_filtering = true;
|
|
|
|
|
2018-07-14 00:18:39 +00:00
|
|
|
const std::string kDBPath = test::PerThreadDBPath("table_prefix_test");
|
2016-02-23 00:33:26 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(DestroyDB(kDBPath, options));
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::DB* db;
|
|
|
|
ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
|
2016-02-23 00:33:26 +00:00
|
|
|
|
|
|
|
// Create a bunch of keys with 10 filters.
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
std::string prefix = "[" + std::to_string(i) + "]";
|
|
|
|
for (int j = 0; j < 10; j++) {
|
|
|
|
std::string key = prefix + std::to_string(j);
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), key, "1"));
|
2016-02-23 00:33:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Trigger compaction.
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
2016-02-23 00:33:26 +00:00
|
|
|
delete db;
|
|
|
|
// In the second round, turn whole_key_filtering off and expect
|
|
|
|
// rocksdb still works.
|
|
|
|
}
|
|
|
|
|
2018-07-27 23:00:26 +00:00
|
|
|
/*
|
|
|
|
* Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in
|
|
|
|
* the SST file any more. Instead, RocksDB deduces global_seqno from the
|
|
|
|
* MANIFEST while reading from an SST. Therefore, it's not possible to test the
|
|
|
|
* functionality of global_seqno in a single, isolated unit test without the
|
|
|
|
* involvement of Version, VersionSet, etc.
|
|
|
|
*/
|
|
|
|
TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) {
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
|
2016-10-18 23:59:37 +00:00
|
|
|
test::StringSink* sink = new test::StringSink();
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSWritableFile> holder(sink);
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(holder), "" /* don't care */, FileOptions()));
|
2016-10-18 23:59:37 +00:00
|
|
|
Options options;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
2016-10-18 23:59:37 +00:00
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
|
|
|
internal_tbl_prop_coll_factories.emplace_back(
|
2016-10-18 23:59:37 +00:00
|
|
|
new SstFileWriterPropertiesCollectorFactory(2 /* version */,
|
|
|
|
0 /* global_seqno*/));
|
|
|
|
std::string column_family_name;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2016-10-18 23:59:37 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
|
2024-02-02 22:14:43 +00:00
|
|
|
&internal_tbl_prop_coll_factories, kNoCompression,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 20:49:24 +00:00
|
|
|
CompressionOptions(), kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
column_family_name, -1, kUnknownNewestKeyTime),
|
2016-10-18 23:59:37 +00:00
|
|
|
file_writer.get()));
|
|
|
|
|
|
|
|
for (char c = 'a'; c <= 'z'; ++c) {
|
|
|
|
std::string key(8, c);
|
|
|
|
std::string value = key;
|
|
|
|
InternalKey ik(key, 0, kTypeValue);
|
|
|
|
|
|
|
|
builder->Add(ik.Encode(), value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(builder->Finish());
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
ASSERT_OK(file_writer->Flush(IOOptions()));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
test::RandomRWStringSink ss_rw(sink);
|
|
|
|
uint32_t version;
|
|
|
|
uint64_t global_seqno;
|
|
|
|
uint64_t global_seqno_offset;
|
|
|
|
|
|
|
|
// Helper function to get version, global_seqno, global_seqno_offset
|
|
|
|
std::function<void()> GetVersionAndGlobalSeqno = [&]() {
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSRandomAccessFile> source(
|
|
|
|
new test::StringSource(ss_rw.contents(), 73342, true));
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
2021-01-04 23:59:52 +00:00
|
|
|
new RandomAccessFileReader(std::move(source), ""));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
std::unique_ptr<TableProperties> props;
|
2016-10-18 23:59:37 +00:00
|
|
|
ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(),
|
|
|
|
kBlockBasedTableMagicNumber, ioptions,
|
2023-04-21 16:07:18 +00:00
|
|
|
read_options, &props));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
UserCollectedProperties user_props = props->user_collected_properties;
|
|
|
|
version = DecodeFixed32(
|
|
|
|
user_props[ExternalSstFilePropertyNames::kVersion].c_str());
|
|
|
|
global_seqno = DecodeFixed64(
|
|
|
|
user_props[ExternalSstFilePropertyNames::kGlobalSeqno].c_str());
|
2021-12-02 16:29:12 +00:00
|
|
|
global_seqno_offset = props->external_sst_file_global_seqno_offset;
|
2016-10-18 23:59:37 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Helper function to update the value of the global seqno in the file
|
|
|
|
std::function<void(uint64_t)> SetGlobalSeqno = [&](uint64_t val) {
|
|
|
|
std::string new_global_seqno;
|
|
|
|
PutFixed64(&new_global_seqno, val);
|
|
|
|
|
2021-01-04 23:59:52 +00:00
|
|
|
ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno, IOOptions(),
|
|
|
|
nullptr));
|
2016-10-18 23:59:37 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Helper function to get the contents of the table InternalIterator
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<TableReader> table_reader;
|
2016-10-18 23:59:37 +00:00
|
|
|
std::function<InternalIterator*()> GetTableInternalIter = [&]() {
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSRandomAccessFile> source(
|
|
|
|
new test::StringSource(ss_rw.contents(), 73342, true));
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
2021-01-04 23:59:52 +00:00
|
|
|
new RandomAccessFileReader(std::move(source), ""));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
options.table_factory->NewTableReader(
|
2022-01-21 19:36:36 +00:00
|
|
|
TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(),
|
2023-04-25 19:08:23 +00:00
|
|
|
ikc, 0 /* block_protection_bytes_per_key */),
|
2018-05-21 21:33:55 +00:00
|
|
|
std::move(file_reader), ss_rw.contents().size(), &table_reader);
|
2016-10-18 23:59:37 +00:00
|
|
|
|
2019-06-20 21:28:22 +00:00
|
|
|
return table_reader->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
2019-06-20 21:28:22 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized);
|
2016-10-18 23:59:37 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
GetVersionAndGlobalSeqno();
|
2019-09-09 18:22:28 +00:00
|
|
|
ASSERT_EQ(2u, version);
|
|
|
|
ASSERT_EQ(0u, global_seqno);
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
InternalIterator* iter = GetTableInternalIter();
|
|
|
|
char current_c = 'a';
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ParsedInternalKey pik;
|
2020-10-28 17:11:13 +00:00
|
|
|
ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(pik.type, ValueType::kTypeValue);
|
|
|
|
ASSERT_EQ(pik.sequence, 0);
|
|
|
|
ASSERT_EQ(pik.user_key, iter->value());
|
|
|
|
ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
|
|
|
|
current_c++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(current_c, 'z' + 1);
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// Update global sequence number to 10
|
|
|
|
SetGlobalSeqno(10);
|
|
|
|
GetVersionAndGlobalSeqno();
|
2019-09-09 18:22:28 +00:00
|
|
|
ASSERT_EQ(2u, version);
|
|
|
|
ASSERT_EQ(10u, global_seqno);
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
iter = GetTableInternalIter();
|
|
|
|
current_c = 'a';
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ParsedInternalKey pik;
|
2020-10-28 17:11:13 +00:00
|
|
|
ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(pik.type, ValueType::kTypeValue);
|
|
|
|
ASSERT_EQ(pik.sequence, 10);
|
|
|
|
ASSERT_EQ(pik.user_key, iter->value());
|
|
|
|
ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
|
|
|
|
current_c++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(current_c, 'z' + 1);
|
|
|
|
|
|
|
|
// Verify Seek
|
|
|
|
for (char c = 'a'; c <= 'z'; c++) {
|
|
|
|
std::string k = std::string(8, c);
|
|
|
|
InternalKey ik(k, 10, kValueTypeForSeek);
|
|
|
|
iter->Seek(ik.Encode());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
|
|
|
|
ParsedInternalKey pik;
|
2020-10-28 17:11:13 +00:00
|
|
|
ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(pik.type, ValueType::kTypeValue);
|
|
|
|
ASSERT_EQ(pik.sequence, 10);
|
|
|
|
ASSERT_EQ(pik.user_key.ToString(), k);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), k);
|
|
|
|
}
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
// Update global sequence number to 3
|
|
|
|
SetGlobalSeqno(3);
|
|
|
|
GetVersionAndGlobalSeqno();
|
2019-09-09 18:22:28 +00:00
|
|
|
ASSERT_EQ(2u, version);
|
|
|
|
ASSERT_EQ(3u, global_seqno);
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
iter = GetTableInternalIter();
|
|
|
|
current_c = 'a';
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ParsedInternalKey pik;
|
2020-10-28 17:11:13 +00:00
|
|
|
ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(pik.type, ValueType::kTypeValue);
|
|
|
|
ASSERT_EQ(pik.sequence, 3);
|
|
|
|
ASSERT_EQ(pik.user_key, iter->value());
|
|
|
|
ASSERT_EQ(pik.user_key.ToString(), std::string(8, current_c));
|
|
|
|
current_c++;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(current_c, 'z' + 1);
|
|
|
|
|
|
|
|
// Verify Seek
|
|
|
|
for (char c = 'a'; c <= 'z'; c++) {
|
|
|
|
std::string k = std::string(8, c);
|
|
|
|
// seqno=4 is less than 3 so we still should get our key
|
|
|
|
InternalKey ik(k, 4, kValueTypeForSeek);
|
|
|
|
iter->Seek(ik.Encode());
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
|
|
|
|
ParsedInternalKey pik;
|
2020-10-28 17:11:13 +00:00
|
|
|
ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */));
|
2016-10-18 23:59:37 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(pik.type, ValueType::kTypeValue);
|
|
|
|
ASSERT_EQ(pik.sequence, 3);
|
|
|
|
ASSERT_EQ(pik.user_key.ToString(), k);
|
|
|
|
ASSERT_EQ(iter->value().ToString(), k);
|
|
|
|
}
|
|
|
|
|
|
|
|
delete iter;
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BlockAlignTest) {
|
|
|
|
BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
|
2018-03-27 03:14:24 +00:00
|
|
|
bbto.block_align = true;
|
|
|
|
test::StringSink* sink = new test::StringSink();
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSWritableFile> holder(sink);
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(holder), "" /* don't care */, FileOptions()));
|
2018-03-27 03:14:24 +00:00
|
|
|
Options options;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
2024-04-27 03:05:30 +00:00
|
|
|
ASSERT_OK(options.table_factory->ValidateOptions(
|
|
|
|
DBOptions(options), ColumnFamilyOptions(options)));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
2018-03-27 03:14:24 +00:00
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2018-03-27 03:14:24 +00:00
|
|
|
std::string column_family_name;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2018-03-27 03:14:24 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
|
2024-02-02 22:14:43 +00:00
|
|
|
&internal_tbl_prop_coll_factories, kNoCompression,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 20:49:24 +00:00
|
|
|
CompressionOptions(), kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
column_family_name, -1, kUnknownNewestKeyTime),
|
2018-03-27 03:14:24 +00:00
|
|
|
file_writer.get()));
|
|
|
|
|
|
|
|
for (int i = 1; i <= 10000; ++i) {
|
|
|
|
std::ostringstream ostr;
|
|
|
|
ostr << std::setfill('0') << std::setw(5) << i;
|
|
|
|
std::string key = ostr.str();
|
|
|
|
std::string value = "val";
|
|
|
|
InternalKey ik(key, 0, kTypeValue);
|
|
|
|
|
|
|
|
builder->Add(ik.Encode(), value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(builder->Finish());
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
ASSERT_OK(file_writer->Flush(IOOptions()));
|
2018-03-27 03:14:24 +00:00
|
|
|
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSRandomAccessFile> source(
|
|
|
|
new test::StringSource(sink->contents(), 73342, false));
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
2021-01-04 23:59:52 +00:00
|
|
|
new RandomAccessFileReader(std::move(source), "test"));
|
2018-03-27 03:14:24 +00:00
|
|
|
// Helper function to get version, global_seqno, global_seqno_offset
|
|
|
|
std::function<void()> VerifyBlockAlignment = [&]() {
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
std::unique_ptr<TableProperties> props;
|
2021-01-04 23:59:52 +00:00
|
|
|
ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(),
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
kBlockBasedTableMagicNumber, ioptions,
|
2023-04-21 16:07:18 +00:00
|
|
|
read_options, &props));
|
2018-03-27 03:14:24 +00:00
|
|
|
|
|
|
|
uint64_t data_block_size = props->data_size / props->num_data_blocks;
|
|
|
|
ASSERT_EQ(data_block_size, 4096);
|
|
|
|
ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks);
|
|
|
|
};
|
|
|
|
|
|
|
|
VerifyBlockAlignment();
|
|
|
|
|
|
|
|
// The below block of code verifies that we can read back the keys. Set
|
|
|
|
// block_align to false when creating the reader to ensure we can flip between
|
|
|
|
// the two modes without any issues
|
|
|
|
std::unique_ptr<TableReader> table_reader;
|
|
|
|
bbto.block_align = false;
|
|
|
|
Options options2;
|
2024-04-27 03:05:30 +00:00
|
|
|
options2.compression = kNoCompression;
|
2018-03-27 03:14:24 +00:00
|
|
|
options2.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
2024-04-27 03:05:30 +00:00
|
|
|
ASSERT_OK(options2.table_factory->ValidateOptions(
|
|
|
|
DBOptions(options2), ColumnFamilyOptions(options2)));
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions2(options2);
|
2018-05-21 21:33:55 +00:00
|
|
|
const MutableCFOptions moptions2(options2);
|
|
|
|
|
2024-10-17 21:13:20 +00:00
|
|
|
ASSERT_OK(moptions.table_factory->NewTableReader(
|
2022-01-21 19:36:36 +00:00
|
|
|
TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(),
|
2023-04-25 19:08:23 +00:00
|
|
|
GetPlainInternalComparator(options2.comparator),
|
|
|
|
0 /* block_protection_bytes_per_key */),
|
2021-01-04 23:59:52 +00:00
|
|
|
std::move(file_reader), sink->contents().size(), &table_reader));
|
2018-03-27 03:14:24 +00:00
|
|
|
|
2018-05-21 21:33:55 +00:00
|
|
|
std::unique_ptr<InternalIterator> db_iter(table_reader->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, moptions2.prefix_extractor.get(), /*arena=*/nullptr,
|
2019-06-20 21:28:22 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
2018-03-27 03:14:24 +00:00
|
|
|
|
|
|
|
int expected_key = 1;
|
|
|
|
for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
|
|
|
|
std::ostringstream ostr;
|
|
|
|
ostr << std::setfill('0') << std::setw(5) << expected_key++;
|
|
|
|
std::string key = ostr.str();
|
|
|
|
std::string value = "val";
|
|
|
|
|
|
|
|
ASSERT_OK(db_iter->status());
|
|
|
|
ASSERT_EQ(ExtractUserKey(db_iter->key()).ToString(), key);
|
|
|
|
ASSERT_EQ(db_iter->value().ToString(), value);
|
|
|
|
}
|
|
|
|
expected_key--;
|
|
|
|
ASSERT_EQ(expected_key, 10000);
|
|
|
|
table_reader.reset();
|
|
|
|
}
|
|
|
|
|
2024-04-22 21:07:34 +00:00
|
|
|
TEST_P(BlockBasedTableTest, FixBlockAlignMismatchedFileChecksums) {
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.block_align = true;
|
|
|
|
bbto.block_size = 1024;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
2024-04-27 03:05:30 +00:00
|
|
|
ASSERT_OK(options.table_factory->ValidateOptions(
|
|
|
|
DBOptions(options), ColumnFamilyOptions(options)));
|
2024-04-22 21:07:34 +00:00
|
|
|
const std::string kDBPath =
|
|
|
|
test::PerThreadDBPath("block_align_padded_bytes_verify_file_checksums");
|
|
|
|
ASSERT_OK(DestroyDB(kDBPath, options));
|
|
|
|
DB* db;
|
|
|
|
ASSERT_OK(DB::Open(options, kDBPath, &db));
|
|
|
|
ASSERT_OK(db->Put(WriteOptions(), "k1", "v1"));
|
|
|
|
ASSERT_OK(db->Flush(FlushOptions()));
|
|
|
|
// Before the fix, VerifyFileChecksums() will fail as padded bytes from
|
|
|
|
// aligning blocks are used to generate the checksum to compare against the
|
|
|
|
// one not generated by padded bytes
|
|
|
|
ASSERT_OK(db->VerifyFileChecksums(ReadOptions()));
|
|
|
|
delete db;
|
|
|
|
}
|
|
|
|
|
2024-04-30 22:38:53 +00:00
|
|
|
class NoBufferAlignmenttWritableFile : public FSWritableFileOwnerWrapper {
|
|
|
|
public:
|
|
|
|
explicit NoBufferAlignmenttWritableFile(
|
|
|
|
std::unique_ptr<FSWritableFile>&& file)
|
|
|
|
: FSWritableFileOwnerWrapper(std::move(file)) {}
|
|
|
|
size_t GetRequiredBufferAlignment() const override { return 1; }
|
|
|
|
};
|
|
|
|
|
|
|
|
class NoBufferAlignmenttWritableFileFileSystem : public FileSystemWrapper {
|
|
|
|
public:
|
|
|
|
explicit NoBufferAlignmenttWritableFileFileSystem(
|
|
|
|
const std::shared_ptr<FileSystem>& base)
|
|
|
|
: FileSystemWrapper(base) {}
|
|
|
|
|
|
|
|
static const char* kClassName() {
|
|
|
|
return "NoBufferAlignmenttWritableFileFileSystem";
|
|
|
|
}
|
|
|
|
const char* Name() const override { return kClassName(); }
|
|
|
|
|
|
|
|
IOStatus NewWritableFile(const std::string& fname,
|
|
|
|
const FileOptions& file_opts,
|
|
|
|
std::unique_ptr<FSWritableFile>* result,
|
|
|
|
IODebugContext* dbg) override {
|
|
|
|
IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
|
|
|
|
EXPECT_OK(s);
|
|
|
|
result->reset(new NoBufferAlignmenttWritableFile(std::move(*result)));
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(BlockBasedTableTest,
|
|
|
|
FixBlockAlignFlushDuringPadMismatchedFileChecksums) {
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
|
|
|
|
// To force flush during pad by enforcing a small buffer size
|
|
|
|
options.writable_file_max_buffer_size = 1;
|
|
|
|
// To help enforce a small buffer size by removing buffer alignment
|
|
|
|
Env* raw_env = Env::Default();
|
|
|
|
std::shared_ptr<NoBufferAlignmenttWritableFileFileSystem> fs =
|
|
|
|
std::make_shared<NoBufferAlignmenttWritableFileFileSystem>(
|
|
|
|
raw_env->GetFileSystem());
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(raw_env, fs));
|
|
|
|
options.env = env.get();
|
|
|
|
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.block_align = true;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
const std::string kDBPath = test::PerThreadDBPath(
|
|
|
|
"block_align_flush_during_flush_verify_file_checksums");
|
|
|
|
ASSERT_OK(DestroyDB(kDBPath, options));
|
|
|
|
DB* db;
|
|
|
|
ASSERT_OK(DB::Open(options, kDBPath, &db));
|
|
|
|
|
|
|
|
ASSERT_OK(db->Put(WriteOptions(), "k1", "k2"));
|
|
|
|
ASSERT_OK(db->Flush(FlushOptions()));
|
|
|
|
|
|
|
|
// Before the fix, VerifyFileChecksums() will fail as incorrect padded bytes
|
|
|
|
// were used to generate checksum upon file creation
|
|
|
|
ASSERT_OK(db->VerifyFileChecksums(ReadOptions()));
|
|
|
|
delete db;
|
|
|
|
}
|
|
|
|
|
2018-06-12 19:46:48 +00:00
|
|
|
TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
|
|
|
|
BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
|
|
|
|
bbto.block_align = true;
|
|
|
|
test::StringSink* sink = new test::StringSink();
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSWritableFile> holder(sink);
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(holder), "" /* don't care */, FileOptions()));
|
2018-06-12 19:46:48 +00:00
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
2024-04-27 03:05:30 +00:00
|
|
|
ASSERT_OK(options.table_factory->ValidateOptions(
|
|
|
|
DBOptions(options), ColumnFamilyOptions(options)));
|
2018-06-12 19:46:48 +00:00
|
|
|
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-06-12 19:46:48 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2018-06-12 19:46:48 +00:00
|
|
|
std::string column_family_name;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2018-06-12 19:46:48 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
|
2024-02-02 22:14:43 +00:00
|
|
|
&internal_tbl_prop_coll_factories, kNoCompression,
|
Add more LSM info to FilterBuildingContext (#8246)
Summary:
Add `num_levels`, `is_bottommost`, and table file creation
`reason` to `FilterBuildingContext`, in anticipation of more powerful
Bloom-like filter support.
To support this, added `is_bottommost` and `reason` to
`TableBuilderOptions`, which allowed removing `reason` parameter from
`rocksdb::BuildTable`.
I attempted to remove `skip_filters` from `TableBuilderOptions`, because
filter construction decisions should arise from options, not one-off
parameters. I could not completely remove it because the public API for
SstFileWriter takes a `skip_filters` parameter, and translating this
into an option change would mean awkwardly replacing the table_factory
if it is BlockBasedTableFactory with new filter_policy=nullptr option.
I marked this public skip_filters option as deprecated because of this
oddity. (skip_filters on the read side probably makes sense.)
At least `skip_filters` is now largely hidden for users of
`TableBuilderOptions` and is no longer used for implementing the
optimize_filters_for_hits option. Bringing the logic for that option
closer to handling of FilterBuildingContext makes it more obvious that
hese two are using the same notion of "bottommost." (Planned:
configuration options for Bloom-like filters that generalize
`optimize_filters_for_hits`)
Recommended follow-up: Try to get away from "bottommost level" naming of
things, which is inaccurate (see
VersionStorageInfo::RangeMightExistAfterSortedRun), and move to
"bottommost run" or just "bottommost."
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8246
Test Plan:
extended an existing unit test to exercise and check various
filter building contexts. Also, existing tests for
optimize_filters_for_hits validate some of the "bottommost" handling,
which is now closely connected to FilterBuildingContext::is_bottommost
through TableBuilderOptions::is_bottommost
Reviewed By: mrambacher
Differential Revision: D28099346
Pulled By: pdillinger
fbshipit-source-id: 2c1072e29c24d4ac404c761a7b7663292372600a
2021-04-30 20:49:24 +00:00
|
|
|
CompressionOptions(), kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
column_family_name, -1, kUnknownNewestKeyTime),
|
2018-06-12 19:46:48 +00:00
|
|
|
file_writer.get()));
|
|
|
|
|
|
|
|
for (int i = 1; i <= 10000; ++i) {
|
|
|
|
std::ostringstream ostr;
|
|
|
|
ostr << std::setfill('0') << std::setw(5) << i;
|
|
|
|
std::string key = ostr.str();
|
|
|
|
std::string value = "val";
|
|
|
|
InternalKey ik(key, 0, kTypeValue);
|
|
|
|
|
|
|
|
builder->Add(ik.Encode(), value);
|
|
|
|
}
|
|
|
|
ASSERT_OK(builder->Finish());
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
ASSERT_OK(file_writer->Flush(IOOptions()));
|
2018-06-12 19:46:48 +00:00
|
|
|
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSRandomAccessFile> source(
|
|
|
|
new test::StringSource(sink->contents(), 73342, true));
|
2018-11-09 19:17:34 +00:00
|
|
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
2021-01-04 23:59:52 +00:00
|
|
|
new RandomAccessFileReader(std::move(source), "test"));
|
2018-06-12 19:46:48 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
RandomAccessFileReader* file = file_reader.get();
|
2021-01-04 23:59:52 +00:00
|
|
|
uint64_t file_size = sink->contents().size();
|
2018-06-12 19:46:48 +00:00
|
|
|
|
|
|
|
Footer footer;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
ASSERT_OK(ReadFooterFromFile(IOOptions(), file, *FileSystem::Default(),
|
2022-12-09 18:03:47 +00:00
|
|
|
nullptr /* prefetch_buffer */, file_size,
|
|
|
|
&footer, kBlockBasedTableMagicNumber));
|
2018-06-12 19:46:48 +00:00
|
|
|
|
2019-06-19 02:00:03 +00:00
|
|
|
auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type,
|
2018-06-12 19:46:48 +00:00
|
|
|
BlockContents* contents) {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
ReadOptions read_options_for_helper;
|
|
|
|
read_options_for_helper.verify_checksums = false;
|
2018-06-12 19:46:48 +00:00
|
|
|
PersistentCacheOptions cache_options;
|
|
|
|
|
2018-11-29 01:58:08 +00:00
|
|
|
BlockFetcher block_fetcher(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
file, nullptr /* prefetch_buffer */, footer, read_options_for_helper,
|
|
|
|
handle, contents, ioptions, false /* decompress */,
|
2019-06-19 02:00:03 +00:00
|
|
|
false /*maybe_compressed*/, block_type,
|
|
|
|
UncompressionDict::GetEmptyDict(), cache_options);
|
2018-06-12 19:46:48 +00:00
|
|
|
|
|
|
|
ASSERT_OK(block_fetcher.ReadBlockContents());
|
|
|
|
};
|
|
|
|
|
|
|
|
// -- Read metaindex block
|
|
|
|
auto metaindex_handle = footer.metaindex_handle();
|
|
|
|
BlockContents metaindex_contents;
|
|
|
|
|
2019-06-19 02:00:03 +00:00
|
|
|
BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex,
|
2019-06-19 21:07:36 +00:00
|
|
|
&metaindex_contents);
|
2020-02-25 23:29:17 +00:00
|
|
|
Block metaindex_block(std::move(metaindex_contents));
|
2018-06-12 19:46:48 +00:00
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
std::unique_ptr<InternalIterator> meta_iter(metaindex_block.NewDataIterator(
|
2020-07-08 00:25:08 +00:00
|
|
|
BytewiseComparator(), kDisableGlobalSequenceNumber));
|
2018-06-12 19:46:48 +00:00
|
|
|
|
|
|
|
// -- Read properties block
|
|
|
|
BlockHandle properties_handle;
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_OK(FindOptionalMetaBlock(meta_iter.get(), kPropertiesBlockName,
|
Improve / clean up meta block code & integrity (#9163)
Summary:
* Checksums are now checked on meta blocks unless specifically
suppressed or not applicable (e.g. plain table). (Was other way around.)
This means a number of cases that were not checking checksums now are,
including direct read TableProperties in Version::GetTableProperties
(fixed in meta_blocks ReadTableProperties), reading any block from
PersistentCache (fixed in BlockFetcher), read TableProperties in
SstFileDumper (ldb/sst_dump/BackupEngine) before table reader open,
maybe more.
* For that to work, I moved the global_seqno+TableProperties checksum
logic to the shared table/ code, because that is used by many utilies
such as SstFileDumper.
* Also for that to work, we have to know when we're dealing with a block
that has a checksum (trailer), so added that capability to Footer based
on magic number, and from there BlockFetcher.
* Knowledge of trailer presence has also fixed a problem where other
table formats were reading blocks including bytes for a non-existant
trailer--and awkwardly kind-of not using them, e.g. no shared code
checking checksums. (BlockFetcher compression type was populated
incorrectly.) Now we only read what is needed.
* Minimized code duplication and differing/incompatible/awkward
abstractions in meta_blocks.{cc,h} (e.g. SeekTo in metaindex block
without parsing block handle)
* Moved some meta block handling code from table_properties*.*
* Moved some code specific to block-based table from shared table/ code
to BlockBasedTable class. The checksum stuff means we can't completely
separate it, but things that don't need to be in shared table/ code
should not be.
* Use unique_ptr rather than raw ptr in more places. (Note: you can
std::move from unique_ptr to shared_ptr.)
Without enhancements to GetPropertiesOfAllTablesTest (see below),
net reduction of roughly 100 lines of code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9163
Test Plan:
existing tests and
* Enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to verify that
checksums are now checked on direct read of table properties by TableCache
(new test would fail before this change)
* Also enhanced DBTablePropertiesTest.GetPropertiesOfAllTablesTest to test
putting table properties under old meta name
* Also generally enhanced that same test to actually test what it was
supposed to be testing already, by kicking things out of table cache when
we don't want them there.
Reviewed By: ajkr, mrambacher
Differential Revision: D32514757
Pulled By: pdillinger
fbshipit-source-id: 507964b9311d186ae8d1131182290cbd97a99fa9
2021-11-18 19:42:12 +00:00
|
|
|
&properties_handle));
|
|
|
|
ASSERT_FALSE(properties_handle.IsNull());
|
2018-06-12 19:46:48 +00:00
|
|
|
BlockContents properties_contents;
|
2019-06-19 02:00:03 +00:00
|
|
|
BlockFetchHelper(properties_handle, BlockType::kProperties,
|
2019-06-19 21:07:36 +00:00
|
|
|
&properties_contents);
|
2020-02-25 23:29:17 +00:00
|
|
|
Block properties_block(std::move(properties_contents));
|
2018-06-12 19:46:48 +00:00
|
|
|
|
2019-09-09 18:22:28 +00:00
|
|
|
ASSERT_EQ(properties_block.NumRestarts(), 1u);
|
2018-06-12 19:46:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-22 04:57:40 +00:00
|
|
|
TEST_P(BlockBasedTableTest, CompressionRatioThreshold) {
|
2023-04-22 19:41:36 +00:00
|
|
|
for (CompressionType type : GetSupportedCompressions()) {
|
|
|
|
if (type == kNoCompression) {
|
|
|
|
continue;
|
|
|
|
}
|
2023-04-24 16:33:33 +00:00
|
|
|
if (type == kBZip2Compression) {
|
|
|
|
// Weird behavior in this test
|
|
|
|
continue;
|
|
|
|
}
|
2023-04-22 19:41:36 +00:00
|
|
|
SCOPED_TRACE("Compression type: " + std::to_string(type));
|
2023-04-22 04:57:40 +00:00
|
|
|
|
2023-04-22 19:41:36 +00:00
|
|
|
Options options;
|
|
|
|
options.compression = type;
|
2023-04-22 04:57:40 +00:00
|
|
|
|
2023-04-22 19:41:36 +00:00
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
int len = 10000;
|
|
|
|
Random rnd(301);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
2023-04-22 04:57:40 +00:00
|
|
|
|
2023-04-22 19:41:36 +00:00
|
|
|
// Test the max_compressed_bytes_per_kb option
|
|
|
|
for (int threshold : {0, 1, 100, 400, 600, 900, 1024}) {
|
|
|
|
SCOPED_TRACE("threshold=" + std::to_string(threshold));
|
|
|
|
options.compression_opts.max_compressed_bytes_per_kb = threshold;
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
|
|
|
|
for (double compressible_to : {0.25, 0.75}) {
|
|
|
|
SCOPED_TRACE("compressible_to=" + std::to_string(compressible_to));
|
|
|
|
TableConstructor c(BytewiseComparator(),
|
|
|
|
true /* convert_to_internal_key_ */);
|
|
|
|
std::string buf;
|
|
|
|
c.Add("x", test::CompressibleString(&rnd, compressible_to, len, &buf));
|
|
|
|
|
|
|
|
// write an SST file
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
|
|
|
|
size_t table_file_size = c.TEST_GetSink()->contents().size();
|
|
|
|
size_t approx_sst_overhead = 1000;
|
|
|
|
if (compressible_to < threshold / 1024.0) {
|
|
|
|
// Should be compressed (substantial variance depending on algorithm)
|
|
|
|
EXPECT_NEAR2(len * compressible_to + approx_sst_overhead,
|
|
|
|
table_file_size, len / 8);
|
|
|
|
} else {
|
|
|
|
// Should not be compressed
|
|
|
|
EXPECT_NEAR2(len + approx_sst_overhead, table_file_size, len / 10);
|
|
|
|
}
|
2023-04-22 04:57:40 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-20 16:00:33 +00:00
|
|
|
TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
|
|
|
|
// The properties meta-block should come at the end since we always need to
|
|
|
|
// read it when opening a file, unlike index/filter/other meta-blocks, which
|
|
|
|
// are sometimes read depending on the user's configuration. This ordering
|
|
|
|
// allows us to do a small readahead on the end of the file to read properties
|
|
|
|
// and meta-index blocks with one I/O.
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
c.Add("a1", "val1");
|
|
|
|
c.Add("b2", "val2");
|
|
|
|
c.Add("c3", "val3");
|
|
|
|
c.Add("d4", "val4");
|
|
|
|
c.Add("e5", "val5");
|
|
|
|
c.Add("f6", "val6");
|
|
|
|
c.Add("g7", "val7");
|
|
|
|
c.Add("h8", "val8");
|
|
|
|
c.Add("j9", "val9");
|
|
|
|
|
|
|
|
// write an SST file
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(
|
|
|
|
8 /* bits_per_key */, false /* use_block_based_filter */));
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
2018-07-20 16:00:33 +00:00
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
|
|
|
|
// get file reader
|
|
|
|
test::StringSink* table_sink = c.TEST_GetSink();
|
2021-01-04 23:59:52 +00:00
|
|
|
std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
|
|
|
|
table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */));
|
|
|
|
|
|
|
|
std::unique_ptr<RandomAccessFileReader> table_reader(
|
|
|
|
new RandomAccessFileReader(std::move(source), "test"));
|
2018-07-20 16:00:33 +00:00
|
|
|
size_t table_size = table_sink->contents().size();
|
|
|
|
|
|
|
|
// read footer
|
|
|
|
Footer footer;
|
2020-06-29 21:51:57 +00:00
|
|
|
IOOptions opts;
|
2022-12-09 18:03:47 +00:00
|
|
|
ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), *FileSystem::Default(),
|
2018-07-20 16:00:33 +00:00
|
|
|
nullptr /* prefetch_buffer */, table_size,
|
|
|
|
&footer, kBlockBasedTableMagicNumber));
|
|
|
|
|
|
|
|
// read metaindex
|
|
|
|
auto metaindex_handle = footer.metaindex_handle();
|
|
|
|
BlockContents metaindex_contents;
|
|
|
|
PersistentCacheOptions pcache_opts;
|
|
|
|
BlockFetcher block_fetcher(
|
|
|
|
table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
|
|
|
|
metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
|
2019-06-19 02:00:03 +00:00
|
|
|
false /*maybe_compressed*/, BlockType::kMetaIndex,
|
|
|
|
UncompressionDict::GetEmptyDict(), pcache_opts,
|
|
|
|
nullptr /*memory_allocator*/);
|
2018-07-20 16:00:33 +00:00
|
|
|
ASSERT_OK(block_fetcher.ReadBlockContents());
|
2020-02-25 23:29:17 +00:00
|
|
|
Block metaindex_block(std::move(metaindex_contents));
|
2018-07-20 16:00:33 +00:00
|
|
|
|
|
|
|
// verify properties block comes last
|
|
|
|
std::unique_ptr<InternalIterator> metaindex_iter{
|
2021-12-21 19:29:52 +00:00
|
|
|
metaindex_block.NewMetaIterator()};
|
2018-07-20 16:00:33 +00:00
|
|
|
uint64_t max_offset = 0;
|
|
|
|
std::string key_at_max_offset;
|
|
|
|
for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
|
|
|
|
metaindex_iter->Next()) {
|
|
|
|
BlockHandle handle;
|
|
|
|
Slice value = metaindex_iter->value();
|
|
|
|
ASSERT_OK(handle.DecodeFrom(&value));
|
|
|
|
if (handle.offset() > max_offset) {
|
|
|
|
max_offset = handle.offset();
|
|
|
|
key_at_max_offset = metaindex_iter->key().ToString();
|
|
|
|
}
|
|
|
|
}
|
2021-12-10 16:12:09 +00:00
|
|
|
ASSERT_EQ(kPropertiesBlockName, key_at_max_offset);
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
if (FormatVersionUsesIndexHandleInFooter(footer.format_version())) {
|
|
|
|
// If index handle is stored in footer rather than metaindex block,
|
|
|
|
// need separate logic to verify it comes before properties block.
|
|
|
|
ASSERT_GT(max_offset, footer.index_handle().offset());
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(footer.index_handle().IsNull());
|
|
|
|
}
|
2018-07-20 16:00:33 +00:00
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
2021-12-21 19:29:52 +00:00
|
|
|
TEST_P(BlockBasedTableTest, SeekMetaBlocks) {
|
|
|
|
TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */);
|
|
|
|
c.Add("foo_a1", "val1");
|
|
|
|
c.Add("foo_b2", "val2");
|
|
|
|
c.Add("foo_c3", "val3");
|
|
|
|
c.Add("foo_d4", "val4");
|
|
|
|
c.Add("foo_e5", "val5");
|
|
|
|
c.Add("foo_f6", "val6");
|
|
|
|
c.Add("foo_g7", "val7");
|
|
|
|
c.Add("foo_h8", "val8");
|
|
|
|
c.Add("foo_j9", "val9");
|
|
|
|
|
|
|
|
// write an SST file
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kHashSearch;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(
|
|
|
|
8 /* bits_per_key */, false /* use_block_based_filter */));
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(4));
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
c.Finish(options, ioptions, moptions, table_options,
|
|
|
|
GetPlainInternalComparator(options.comparator), &keys, &kvmap);
|
|
|
|
|
|
|
|
// get file reader
|
|
|
|
test::StringSink* table_sink = c.TEST_GetSink();
|
|
|
|
std::unique_ptr<FSRandomAccessFile> source(new test::StringSource(
|
|
|
|
table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */));
|
|
|
|
|
|
|
|
std::unique_ptr<RandomAccessFileReader> table_reader(
|
|
|
|
new RandomAccessFileReader(std::move(source), "test"));
|
|
|
|
size_t table_size = table_sink->contents().size();
|
|
|
|
|
|
|
|
// read footer
|
|
|
|
Footer footer;
|
|
|
|
IOOptions opts;
|
2022-12-09 18:03:47 +00:00
|
|
|
ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), *FileSystem::Default(),
|
2021-12-21 19:29:52 +00:00
|
|
|
nullptr /* prefetch_buffer */, table_size,
|
|
|
|
&footer, kBlockBasedTableMagicNumber));
|
|
|
|
|
|
|
|
// read metaindex
|
|
|
|
auto metaindex_handle = footer.metaindex_handle();
|
|
|
|
BlockContents metaindex_contents;
|
|
|
|
PersistentCacheOptions pcache_opts;
|
|
|
|
BlockFetcher block_fetcher(
|
|
|
|
table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
|
|
|
|
metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
|
|
|
|
false /*maybe_compressed*/, BlockType::kMetaIndex,
|
|
|
|
UncompressionDict::GetEmptyDict(), pcache_opts,
|
|
|
|
nullptr /*memory_allocator*/);
|
|
|
|
ASSERT_OK(block_fetcher.ReadBlockContents());
|
|
|
|
Block metaindex_block(std::move(metaindex_contents));
|
|
|
|
|
|
|
|
// verify properties block comes last
|
|
|
|
std::unique_ptr<MetaBlockIter> metaindex_iter(
|
|
|
|
metaindex_block.NewMetaIterator());
|
|
|
|
bool has_hash_prefixes = false;
|
|
|
|
bool has_hash_metadata = false;
|
|
|
|
for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
|
|
|
|
metaindex_iter->Next()) {
|
|
|
|
if (metaindex_iter->key().ToString() == kHashIndexPrefixesBlock) {
|
|
|
|
has_hash_prefixes = true;
|
|
|
|
} else if (metaindex_iter->key().ToString() ==
|
|
|
|
kHashIndexPrefixesMetadataBlock) {
|
|
|
|
has_hash_metadata = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (has_hash_metadata) {
|
|
|
|
metaindex_iter->Seek(kHashIndexPrefixesMetadataBlock);
|
|
|
|
ASSERT_TRUE(metaindex_iter->Valid());
|
|
|
|
ASSERT_EQ(kHashIndexPrefixesMetadataBlock,
|
|
|
|
metaindex_iter->key().ToString());
|
|
|
|
}
|
|
|
|
if (has_hash_prefixes) {
|
|
|
|
metaindex_iter->Seek(kHashIndexPrefixesBlock);
|
|
|
|
ASSERT_TRUE(metaindex_iter->Valid());
|
|
|
|
ASSERT_EQ(kHashIndexPrefixesBlock, metaindex_iter->key().ToString());
|
|
|
|
}
|
|
|
|
c.ResetTableReader();
|
|
|
|
}
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
TEST_P(BlockBasedTableTest, BadOptions) {
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::Options options;
|
2018-03-27 03:14:24 +00:00
|
|
|
options.compression = kNoCompression;
|
2024-04-27 03:05:30 +00:00
|
|
|
options.create_if_missing = true;
|
2018-06-05 02:59:44 +00:00
|
|
|
BlockBasedTableOptions bbto = GetBlockBasedTableOptions();
|
2018-03-27 03:14:24 +00:00
|
|
|
bbto.block_size = 4000;
|
|
|
|
bbto.block_align = true;
|
|
|
|
|
2018-06-08 19:53:23 +00:00
|
|
|
const std::string kDBPath =
|
2018-07-14 00:18:39 +00:00
|
|
|
test::PerThreadDBPath("block_based_table_bad_options_test");
|
2018-03-27 03:14:24 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(DestroyDB(kDBPath, options));
|
2018-03-27 03:14:24 +00:00
|
|
|
|
2024-04-27 03:05:30 +00:00
|
|
|
std::unique_ptr<DB> db;
|
|
|
|
{
|
|
|
|
ROCKSDB_NAMESPACE::DB* _db;
|
|
|
|
ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
|
|
|
|
|
|
|
|
bbto.block_size = 4096;
|
|
|
|
options.compression = kSnappyCompression;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
|
|
|
|
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.bottommost_compression = kSnappyCompression;
|
|
|
|
ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
|
|
|
|
|
|
|
|
options.bottommost_compression = kNoCompression;
|
|
|
|
options.compression_per_level.emplace_back(kSnappyCompression);
|
|
|
|
ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
|
|
|
|
|
|
|
|
options.compression_per_level.clear();
|
|
|
|
ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
|
|
|
|
db.reset(_db);
|
|
|
|
}
|
2018-03-27 03:14:24 +00:00
|
|
|
}
|
|
|
|
|
2018-07-20 21:31:27 +00:00
|
|
|
TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) {
|
|
|
|
TailPrefetchStats tpstats;
|
|
|
|
ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize());
|
|
|
|
tpstats.RecordEffectiveSize(size_t{1000});
|
|
|
|
tpstats.RecordEffectiveSize(size_t{1005});
|
|
|
|
tpstats.RecordEffectiveSize(size_t{1002});
|
|
|
|
ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize());
|
|
|
|
|
|
|
|
// One single super large value shouldn't influence much
|
|
|
|
tpstats.RecordEffectiveSize(size_t{1002000});
|
|
|
|
tpstats.RecordEffectiveSize(size_t{999});
|
|
|
|
ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize());
|
|
|
|
ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize());
|
|
|
|
|
|
|
|
// Only history of 32 is kept
|
|
|
|
for (int i = 0; i < 32; i++) {
|
|
|
|
tpstats.RecordEffectiveSize(size_t{100});
|
|
|
|
}
|
|
|
|
ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize());
|
|
|
|
|
|
|
|
// 16 large values and 16 small values. The result should be closer
|
|
|
|
// to the small value as the algorithm.
|
|
|
|
for (int i = 0; i < 16; i++) {
|
|
|
|
tpstats.RecordEffectiveSize(size_t{1000});
|
|
|
|
}
|
|
|
|
tpstats.RecordEffectiveSize(size_t{10});
|
|
|
|
tpstats.RecordEffectiveSize(size_t{20});
|
|
|
|
for (int i = 0; i < 6; i++) {
|
|
|
|
tpstats.RecordEffectiveSize(size_t{100});
|
|
|
|
}
|
|
|
|
ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize());
|
|
|
|
ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) {
|
|
|
|
TailPrefetchStats tpstats;
|
2024-01-05 17:29:01 +00:00
|
|
|
FilePrefetchBuffer buffer(ReadaheadParams(), false /* enable */,
|
|
|
|
true /* track_min_offset */);
|
2020-06-29 21:51:57 +00:00
|
|
|
IOOptions opts;
|
2021-11-20 01:52:42 +00:00
|
|
|
buffer.TryReadFromCache(opts, nullptr /* reader */, 500 /* offset */,
|
|
|
|
10 /* n */, nullptr /* result */,
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
nullptr /* status */);
|
2021-11-20 01:52:42 +00:00
|
|
|
buffer.TryReadFromCache(opts, nullptr /* reader */, 480 /* offset */,
|
|
|
|
10 /* n */, nullptr /* result */,
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
nullptr /* status */);
|
2021-11-20 01:52:42 +00:00
|
|
|
buffer.TryReadFromCache(opts, nullptr /* reader */, 490 /* offset */,
|
|
|
|
10 /* n */, nullptr /* result */,
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
nullptr /* status */);
|
2018-07-20 21:31:27 +00:00
|
|
|
ASSERT_EQ(480, buffer.min_offset_read());
|
|
|
|
}
|
|
|
|
|
2018-08-15 21:27:47 +00:00
|
|
|
TEST_P(BlockBasedTableTest, DataBlockHashIndex) {
|
|
|
|
const int kNumKeys = 500;
|
|
|
|
const int kKeySize = 8;
|
|
|
|
const int kValSize = 40;
|
|
|
|
|
|
|
|
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
|
|
|
|
table_options.data_block_index_type =
|
2018-08-17 01:29:13 +00:00
|
|
|
BlockBasedTableOptions::kDataBlockBinaryAndHash;
|
2018-08-15 21:27:47 +00:00
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.comparator = BytewiseComparator();
|
|
|
|
|
|
|
|
options.table_factory.reset(new BlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
TableConstructor c(options.comparator);
|
|
|
|
|
|
|
|
static Random rnd(1048);
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
|
|
|
// padding one "0" to mark existent keys.
|
2020-07-09 21:33:42 +00:00
|
|
|
std::string random_key(rnd.RandomString(kKeySize - 1) + "1");
|
2018-08-15 21:27:47 +00:00
|
|
|
InternalKey k(random_key, 0, kTypeValue);
|
2020-07-09 21:33:42 +00:00
|
|
|
c.Add(k.Encode().ToString(), rnd.RandomString(kValSize));
|
2018-08-15 21:27:47 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2018-08-15 21:27:47 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
const InternalKeyComparator internal_comparator(options.comparator);
|
|
|
|
c.Finish(options, ioptions, moptions, table_options, internal_comparator,
|
|
|
|
&keys, &kvmap);
|
|
|
|
|
|
|
|
auto reader = c.GetTableReader();
|
|
|
|
|
|
|
|
std::unique_ptr<InternalIterator> seek_iter;
|
2020-08-03 22:21:56 +00:00
|
|
|
ReadOptions read_options;
|
2019-06-20 21:28:22 +00:00
|
|
|
seek_iter.reset(reader->NewIterator(
|
2020-08-03 22:21:56 +00:00
|
|
|
read_options, moptions.prefix_extractor.get(), /*arena=*/nullptr,
|
2019-06-20 21:28:22 +00:00
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized));
|
2018-08-15 21:27:47 +00:00
|
|
|
for (int i = 0; i < 2; ++i) {
|
|
|
|
ReadOptions ro;
|
|
|
|
// for every kv, we seek using two method: Get() and Seek()
|
|
|
|
// Get() will use the SuffixIndexHash in Block. For non-existent key it
|
|
|
|
// will invalidate the iterator
|
|
|
|
// Seek() will use the default BinarySeek() in Block. So for non-existent
|
|
|
|
// key it will land at the closest key that is large than target.
|
|
|
|
|
|
|
|
// Search for existent keys
|
|
|
|
for (auto& kv : kvmap) {
|
|
|
|
if (i == 0) {
|
|
|
|
// Search using Seek()
|
|
|
|
seek_iter->Seek(kv.first);
|
|
|
|
ASSERT_OK(seek_iter->status());
|
|
|
|
ASSERT_TRUE(seek_iter->Valid());
|
|
|
|
ASSERT_EQ(seek_iter->key(), kv.first);
|
|
|
|
ASSERT_EQ(seek_iter->value(), kv.second);
|
|
|
|
} else {
|
|
|
|
// Search using Get()
|
|
|
|
PinnableSlice value;
|
|
|
|
std::string user_key = ExtractUserKey(kv.first).ToString();
|
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
|
|
GetContext::kNotFound, user_key, &value, nullptr,
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
nullptr, nullptr, true, nullptr, nullptr);
|
2018-08-15 21:27:47 +00:00
|
|
|
ASSERT_OK(reader->Get(ro, kv.first, &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
|
|
|
ASSERT_EQ(get_context.State(), GetContext::kFound);
|
|
|
|
ASSERT_EQ(value, Slice(kv.second));
|
|
|
|
value.Reset();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Search for non-existent keys
|
|
|
|
for (auto& kv : kvmap) {
|
|
|
|
std::string user_key = ExtractUserKey(kv.first).ToString();
|
2018-08-17 01:29:13 +00:00
|
|
|
user_key.back() = '0'; // make it non-existent key
|
2018-08-15 21:27:47 +00:00
|
|
|
InternalKey internal_key(user_key, 0, kTypeValue);
|
|
|
|
std::string encoded_key = internal_key.Encode().ToString();
|
2018-08-17 01:29:13 +00:00
|
|
|
if (i == 0) { // Search using Seek()
|
2018-08-15 21:27:47 +00:00
|
|
|
seek_iter->Seek(encoded_key);
|
|
|
|
ASSERT_OK(seek_iter->status());
|
2018-08-17 01:29:13 +00:00
|
|
|
if (seek_iter->Valid()) {
|
2018-08-15 21:27:47 +00:00
|
|
|
ASSERT_TRUE(BytewiseComparator()->Compare(
|
|
|
|
user_key, ExtractUserKey(seek_iter->key())) < 0);
|
|
|
|
}
|
2018-08-17 01:29:13 +00:00
|
|
|
} else { // Search using Get()
|
2018-08-15 21:27:47 +00:00
|
|
|
PinnableSlice value;
|
|
|
|
GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
|
|
|
|
GetContext::kNotFound, user_key, &value, nullptr,
|
Add support for wide-column point lookups (#10540)
Summary:
The patch adds a new API `GetEntity` that can be used to perform
wide-column point lookups. It also extends the `Get` code path and
the `MemTable` / `MemTableList` and `Version` / `GetContext` logic
accordingly so that wide-column entities can be served from both
memtables and SSTs. If the result of a lookup is a wide-column entity
(`kTypeWideColumnEntity`), it is passed to the application in deserialized
form; if it is a plain old key-value (`kTypeValue`), it is presented as a
wide-column entity with a single default (anonymous) column.
(In contrast, regular `Get` returns plain old key-values as-is, and
returns the value of the default column for wide-column entities, see
https://github.com/facebook/rocksdb/issues/10483 .)
The result of `GetEntity` is a self-contained `PinnableWideColumns` object.
`PinnableWideColumns` contains a `PinnableSlice`, which either stores the
underlying data in its own buffer or holds on to a cache handle. It also contains
a `WideColumns` instance, which indexes the contents of the `PinnableSlice`,
so applications can access the values of columns efficiently.
There are several pieces of functionality which are currently not supported
for wide-column entities: there is currently no `MultiGetEntity` or wide-column
iterator; also, `Merge` and `GetMergeOperands` are not supported, and there
is no `GetEntity` implementation for read-only and secondary instances.
We plan to implement these in future PRs.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10540
Test Plan: `make check`
Reviewed By: akankshamahajan15
Differential Revision: D38847474
Pulled By: ltamasi
fbshipit-source-id: 42311a34ccdfe88b3775e847a5e2a5296e002b5b
2022-08-19 18:51:12 +00:00
|
|
|
nullptr, nullptr, true, nullptr, nullptr);
|
2018-08-15 21:27:47 +00:00
|
|
|
ASSERT_OK(reader->Get(ro, encoded_key, &get_context,
|
|
|
|
moptions.prefix_extractor.get()));
|
|
|
|
ASSERT_EQ(get_context.State(), GetContext::kNotFound);
|
|
|
|
value.Reset();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-04-16 18:32:03 +00:00
|
|
|
// BlockBasedTableIterator should invalidate itself and return
|
|
|
|
// OutOfBound()=true immediately after Seek(), to allow LevelIterator
|
|
|
|
// filter out corresponding level.
|
|
|
|
TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) {
|
|
|
|
TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
|
|
|
|
c.Add("foo", "v1");
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2019-04-16 18:32:03 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_opt,
|
|
|
|
GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
|
|
|
|
auto* reader = c.GetTableReader();
|
|
|
|
ReadOptions read_opt;
|
|
|
|
std::string upper_bound = "bar";
|
|
|
|
Slice upper_bound_slice(upper_bound);
|
|
|
|
read_opt.iterate_upper_bound = &upper_bound_slice;
|
|
|
|
std::unique_ptr<InternalIterator> iter;
|
2019-06-20 21:28:22 +00:00
|
|
|
iter.reset(new KeyConvertingIterator(reader->NewIterator(
|
|
|
|
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized)));
|
2019-04-16 18:32:03 +00:00
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2020-08-05 17:42:56 +00:00
|
|
|
ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
|
2019-06-20 21:28:22 +00:00
|
|
|
iter.reset(new KeyConvertingIterator(reader->NewIterator(
|
|
|
|
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized)));
|
2019-04-16 18:32:03 +00:00
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-10-08 18:11:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2020-08-05 17:42:56 +00:00
|
|
|
ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
|
2019-04-16 18:32:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// BlockBasedTableIterator should invalidate itself and return
|
|
|
|
// OutOfBound()=true after Next(), if it finds current index key is no smaller
|
|
|
|
// than upper bound, unless it is pointing to the last data block.
|
|
|
|
TEST_P(BlockBasedTableTest, OutOfBoundOnNext) {
|
|
|
|
TableConstructor c(BytewiseComparator(), true /*convert_to_internal_key*/);
|
|
|
|
c.Add("bar", "v");
|
|
|
|
c.Add("foo", "v");
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
stl_wrappers::KVMap kvmap;
|
|
|
|
Options options;
|
|
|
|
BlockBasedTableOptions table_opt(GetBlockBasedTableOptions());
|
|
|
|
table_opt.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_opt));
|
2021-05-05 20:59:21 +00:00
|
|
|
const ImmutableOptions ioptions(options);
|
2019-04-16 18:32:03 +00:00
|
|
|
const MutableCFOptions moptions(options);
|
|
|
|
c.Finish(options, ioptions, moptions, table_opt,
|
|
|
|
GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap);
|
|
|
|
auto* reader = c.GetTableReader();
|
|
|
|
ReadOptions read_opt;
|
|
|
|
std::string ub1 = "bar_after";
|
|
|
|
Slice ub_slice1(ub1);
|
|
|
|
read_opt.iterate_upper_bound = &ub_slice1;
|
|
|
|
std::unique_ptr<InternalIterator> iter;
|
2019-06-20 21:28:22 +00:00
|
|
|
iter.reset(new KeyConvertingIterator(reader->NewIterator(
|
|
|
|
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized)));
|
2019-04-16 18:32:03 +00:00
|
|
|
iter->Seek("bar");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("bar", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-08-05 17:42:56 +00:00
|
|
|
ASSERT_TRUE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
|
2019-04-16 18:32:03 +00:00
|
|
|
std::string ub2 = "foo_after";
|
|
|
|
Slice ub_slice2(ub2);
|
|
|
|
read_opt.iterate_upper_bound = &ub_slice2;
|
2019-06-20 21:28:22 +00:00
|
|
|
iter.reset(new KeyConvertingIterator(reader->NewIterator(
|
|
|
|
read_opt, /*prefix_extractor=*/nullptr, /*arena=*/nullptr,
|
|
|
|
/*skip_filters=*/false, TableReaderCaller::kUncategorized)));
|
2019-04-16 18:32:03 +00:00
|
|
|
iter->Seek("foo");
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("foo", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-08-05 17:42:56 +00:00
|
|
|
ASSERT_FALSE(iter->UpperBoundCheckResult() == IterBoundCheck::kOutOfBound);
|
2019-04-16 18:32:03 +00:00
|
|
|
}
|
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
class ChargeCompressionDictionaryBuildingBufferTest
|
|
|
|
: public BlockBasedTableTestBase {};
|
|
|
|
TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
|
2021-09-08 19:34:35 +00:00
|
|
|
constexpr std::size_t kSizeDummyEntry = 256 * 1024;
|
|
|
|
constexpr std::size_t kMetaDataChargeOverhead = 10000;
|
|
|
|
constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
|
|
|
|
constexpr std::size_t kMaxDictBytes = 1024;
|
|
|
|
constexpr std::size_t kMaxDictBufferBytes = 1024;
|
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
for (CacheEntryRoleOptions::Decision
|
|
|
|
charge_compression_dictionary_building_buffer :
|
|
|
|
{CacheEntryRoleOptions::Decision::kEnabled,
|
|
|
|
CacheEntryRoleOptions::Decision::kDisabled}) {
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
LRUCacheOptions lo;
|
|
|
|
lo.capacity = kCacheCapacity;
|
|
|
|
lo.num_shard_bits = 0; // 2^0 shard
|
|
|
|
lo.strict_capacity_limit = true;
|
|
|
|
std::shared_ptr<Cache> cache(NewLRUCache(lo));
|
|
|
|
table_options.block_cache = cache;
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
table_options.cache_usage_options.options_overrides.insert(
|
|
|
|
{CacheEntryRole::kCompressionDictionaryBuildingBuffer,
|
|
|
|
{/*.charged = */ charge_compression_dictionary_building_buffer}});
|
|
|
|
Options options;
|
|
|
|
options.compression = kSnappyCompression;
|
|
|
|
options.compression_opts.max_dict_bytes = kMaxDictBytes;
|
|
|
|
options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2021-09-08 19:34:35 +00:00
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
test::StringSink* sink = new test::StringSink();
|
|
|
|
std::unique_ptr<FSWritableFile> holder(sink);
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(holder), "test_file_name", FileOptions()));
|
2021-09-08 19:34:35 +00:00
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2021-09-08 19:34:35 +00:00
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2022-05-17 22:01:51 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(
|
|
|
|
options.table_factory->NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options,
|
2024-02-02 22:14:43 +00:00
|
|
|
ikc, &internal_tbl_prop_coll_factories,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
kSnappyCompression, options.compression_opts,
|
2024-11-01 17:08:35 +00:00
|
|
|
kUnknownColumnFamily, "test_cf", -1 /* level */,
|
|
|
|
kUnknownNewestKeyTime),
|
2022-05-17 22:01:51 +00:00
|
|
|
file_writer.get()));
|
|
|
|
|
|
|
|
std::string key1 = "key1";
|
|
|
|
std::string value1 = "val1";
|
|
|
|
InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue);
|
|
|
|
// Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy
|
|
|
|
// therefore won't trigger any data block's buffering
|
|
|
|
builder->Add(ik1.Encode(), value1);
|
|
|
|
ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
|
|
|
|
std::string key2 = "key2";
|
|
|
|
std::string value2 = "val2";
|
|
|
|
InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue);
|
|
|
|
// Adding the second key will trigger a flush of the last data block (the
|
|
|
|
// one containing key1 and value1) by FlushBlockEveryKeyPolicy and hence
|
|
|
|
// trigger buffering of that data block.
|
|
|
|
builder->Add(ik2.Encode(), value2);
|
|
|
|
// Cache charging will increase for last buffered data block (the one
|
|
|
|
// containing key1 and value1) since the buffer limit is not exceeded after
|
|
|
|
// that buffering and the cache will not be full after this reservation
|
|
|
|
if (charge_compression_dictionary_building_buffer ==
|
|
|
|
CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
|
|
|
|
EXPECT_LT(cache->GetPinnedUsage(),
|
|
|
|
1 * kSizeDummyEntry + kMetaDataChargeOverhead);
|
|
|
|
} else {
|
|
|
|
EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
}
|
2021-09-08 19:34:35 +00:00
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
ASSERT_OK(builder->Finish());
|
|
|
|
EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
}
|
2021-09-08 19:34:35 +00:00
|
|
|
}
|
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
|
|
|
|
BasicWithBufferLimitExceed) {
|
2021-09-08 19:34:35 +00:00
|
|
|
constexpr std::size_t kSizeDummyEntry = 256 * 1024;
|
|
|
|
constexpr std::size_t kMetaDataChargeOverhead = 10000;
|
|
|
|
constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
|
|
|
|
constexpr std::size_t kMaxDictBytes = 1024;
|
|
|
|
constexpr std::size_t kMaxDictBufferBytes = 2 * kSizeDummyEntry;
|
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
// `CacheEntryRoleOptions::charged` is enabled by default for
|
|
|
|
// CacheEntryRole::kCompressionDictionaryBuildingBuffer
|
|
|
|
BlockBasedTableOptions table_options;
|
2021-09-08 19:34:35 +00:00
|
|
|
LRUCacheOptions lo;
|
|
|
|
lo.capacity = kCacheCapacity;
|
|
|
|
lo.num_shard_bits = 0; // 2^0 shard
|
|
|
|
lo.strict_capacity_limit = true;
|
|
|
|
std::shared_ptr<Cache> cache(NewLRUCache(lo));
|
|
|
|
table_options.block_cache = cache;
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.compression = kSnappyCompression;
|
|
|
|
options.compression_opts.max_dict_bytes = kMaxDictBytes;
|
|
|
|
options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
test::StringSink* sink = new test::StringSink();
|
|
|
|
std::unique_ptr<FSWritableFile> holder(sink);
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(holder), "test_file_name", FileOptions()));
|
|
|
|
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2021-09-08 19:34:35 +00:00
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2021-09-08 19:34:35 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
|
2024-02-02 22:14:43 +00:00
|
|
|
&internal_tbl_prop_coll_factories, kSnappyCompression,
|
2021-09-08 19:34:35 +00:00
|
|
|
options.compression_opts, kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
"test_cf", -1 /* level */, kUnknownNewestKeyTime),
|
2021-09-08 19:34:35 +00:00
|
|
|
file_writer.get()));
|
|
|
|
|
|
|
|
std::string key1 = "key1";
|
|
|
|
std::string value1(kSizeDummyEntry, '0');
|
|
|
|
InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue);
|
|
|
|
// Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy
|
|
|
|
// therefore won't trigger any data block's buffering
|
|
|
|
builder->Add(ik1.Encode(), value1);
|
|
|
|
ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
|
|
|
|
std::string key2 = "key2";
|
|
|
|
std::string value2(kSizeDummyEntry, '0');
|
|
|
|
InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue);
|
|
|
|
// Adding the second key will trigger a flush of the last data block (the one
|
|
|
|
// containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger
|
|
|
|
// buffering of the last data block.
|
|
|
|
builder->Add(ik2.Encode(), value2);
|
2022-05-17 22:01:51 +00:00
|
|
|
// Cache charging will increase for last buffered data block (the one
|
2021-09-08 19:34:35 +00:00
|
|
|
// containing key1 and value1) since the buffer limit is not exceeded after
|
|
|
|
// the buffering and the cache will not be full after this reservation
|
|
|
|
EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
|
|
|
|
EXPECT_LT(cache->GetPinnedUsage(),
|
|
|
|
2 * kSizeDummyEntry + kMetaDataChargeOverhead);
|
|
|
|
|
|
|
|
std::string key3 = "key3";
|
|
|
|
std::string value3 = "val3";
|
|
|
|
InternalKey ik3(key3, 2 /* sequnce number */, kTypeValue);
|
|
|
|
// Adding the third key will trigger a flush of the last data block (the one
|
|
|
|
// containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger
|
|
|
|
// buffering of the last data block.
|
|
|
|
builder->Add(ik3.Encode(), value3);
|
2022-05-17 22:01:51 +00:00
|
|
|
// Cache charging will decrease since the buffer limit is now exceeded
|
2021-09-08 19:34:35 +00:00
|
|
|
// after the last buffering and EnterUnbuffered() is triggered
|
|
|
|
EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
|
|
|
|
ASSERT_OK(builder->Finish());
|
|
|
|
EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
}
|
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
|
2021-09-08 19:34:35 +00:00
|
|
|
constexpr std::size_t kSizeDummyEntry = 256 * 1024;
|
|
|
|
constexpr std::size_t kMetaDataChargeOverhead = 10000;
|
2022-05-17 22:01:51 +00:00
|
|
|
// A small kCacheCapacity is chosen so that increase cache charging for
|
2021-09-08 19:34:35 +00:00
|
|
|
// buffering two data blocks, each containing key1/value1, key2/a big
|
|
|
|
// value2, will cause cache full
|
|
|
|
constexpr std::size_t kCacheCapacity =
|
|
|
|
1 * kSizeDummyEntry + kSizeDummyEntry / 2;
|
|
|
|
constexpr std::size_t kMaxDictBytes = 1024;
|
|
|
|
// A big kMaxDictBufferBytes is chosen so that adding a big key value pair
|
|
|
|
// (key2, value2) won't exceed the buffer limit
|
|
|
|
constexpr std::size_t kMaxDictBufferBytes = 1024 * 1024 * 1024;
|
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
// `CacheEntryRoleOptions::charged` is enabled by default for
|
|
|
|
// CacheEntryRole::kCompressionDictionaryBuildingBuffer
|
|
|
|
BlockBasedTableOptions table_options;
|
2021-09-08 19:34:35 +00:00
|
|
|
LRUCacheOptions lo;
|
|
|
|
lo.capacity = kCacheCapacity;
|
|
|
|
lo.num_shard_bits = 0; // 2^0 shard
|
|
|
|
lo.strict_capacity_limit = true;
|
|
|
|
std::shared_ptr<Cache> cache(NewLRUCache(lo));
|
|
|
|
table_options.block_cache = cache;
|
|
|
|
table_options.flush_block_policy_factory =
|
|
|
|
std::make_shared<FlushBlockEveryKeyPolicyFactory>();
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
options.compression = kSnappyCompression;
|
|
|
|
options.compression_opts.max_dict_bytes = kMaxDictBytes;
|
|
|
|
options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
test::StringSink* sink = new test::StringSink();
|
|
|
|
std::unique_ptr<FSWritableFile> holder(sink);
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(holder), "test_file_name", FileOptions()));
|
|
|
|
|
|
|
|
ImmutableOptions ioptions(options);
|
|
|
|
MutableCFOptions moptions(options);
|
|
|
|
InternalKeyComparator ikc(options.comparator);
|
2024-02-02 22:14:43 +00:00
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
2021-09-08 19:34:35 +00:00
|
|
|
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
2021-09-08 19:34:35 +00:00
|
|
|
std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
|
2024-02-02 22:14:43 +00:00
|
|
|
&internal_tbl_prop_coll_factories, kSnappyCompression,
|
2021-09-08 19:34:35 +00:00
|
|
|
options.compression_opts, kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
"test_cf", -1 /* level */, kUnknownNewestKeyTime),
|
2021-09-08 19:34:35 +00:00
|
|
|
file_writer.get()));
|
|
|
|
|
|
|
|
std::string key1 = "key1";
|
|
|
|
std::string value1 = "val1";
|
|
|
|
InternalKey ik1(key1, 0 /* sequnce number */, kTypeValue);
|
|
|
|
// Adding the first key won't trigger a flush by FlushBlockEveryKeyPolicy
|
|
|
|
// therefore won't trigger any data block's buffering
|
|
|
|
builder->Add(ik1.Encode(), value1);
|
|
|
|
ASSERT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
|
|
|
|
std::string key2 = "key2";
|
|
|
|
std::string value2(kSizeDummyEntry, '0');
|
|
|
|
InternalKey ik2(key2, 1 /* sequnce number */, kTypeValue);
|
|
|
|
// Adding the second key will trigger a flush of the last data block (the one
|
|
|
|
// containing key1 and value1) by FlushBlockEveryKeyPolicy and hence trigger
|
|
|
|
// buffering of the last data block.
|
|
|
|
builder->Add(ik2.Encode(), value2);
|
2022-05-17 22:01:51 +00:00
|
|
|
// Cache charging will increase for the last buffered data block (the one
|
2021-09-08 19:34:35 +00:00
|
|
|
// containing key1 and value1) since the buffer limit is not exceeded after
|
|
|
|
// the buffering and the cache will not be full after this reservation
|
|
|
|
EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
|
|
|
|
EXPECT_LT(cache->GetPinnedUsage(),
|
|
|
|
1 * kSizeDummyEntry + kMetaDataChargeOverhead);
|
|
|
|
|
|
|
|
std::string key3 = "key3";
|
|
|
|
std::string value3 = "value3";
|
|
|
|
InternalKey ik3(key3, 2 /* sequnce number */, kTypeValue);
|
|
|
|
// Adding the third key will trigger a flush of the last data block (the one
|
|
|
|
// containing key2 and value2) by FlushBlockEveryKeyPolicy and hence trigger
|
|
|
|
// buffering of the last data block.
|
|
|
|
builder->Add(ik3.Encode(), value3);
|
2022-05-17 22:01:51 +00:00
|
|
|
// Cache charging will decrease since the cache is now full after
|
2021-09-08 19:34:35 +00:00
|
|
|
// increasing reservation for the last buffered block and EnterUnbuffered() is
|
|
|
|
// triggered
|
|
|
|
EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
|
|
|
|
ASSERT_OK(builder->Finish());
|
|
|
|
EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry);
|
|
|
|
}
|
|
|
|
|
2022-05-17 22:01:51 +00:00
|
|
|
class CacheUsageOptionsOverridesTest : public DBTestBase {
|
|
|
|
public:
|
|
|
|
CacheUsageOptionsOverridesTest()
|
|
|
|
: DBTestBase("cache_usage_options_overrides_test",
|
|
|
|
/*env_do_fsync=*/false) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(CacheUsageOptionsOverridesTest, SanitizeAndValidateOptions) {
|
|
|
|
// To test `cache_usage_options.options_overrides` is sanitized
|
|
|
|
// where `cache_usage_options.options` is used when there is no entry in
|
|
|
|
// `cache_usage_options.options_overrides`
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
BlockBasedTableOptions table_options = BlockBasedTableOptions();
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
Destroy(options);
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
EXPECT_TRUE(s.ok());
|
|
|
|
const auto* sanitized_table_options =
|
|
|
|
options.table_factory->GetOptions<BlockBasedTableOptions>();
|
|
|
|
const auto sanitized_options_overrides =
|
|
|
|
sanitized_table_options->cache_usage_options.options_overrides;
|
|
|
|
EXPECT_EQ(sanitized_options_overrides.size(), kNumCacheEntryRoles);
|
|
|
|
for (auto options_overrides_iter = sanitized_options_overrides.cbegin();
|
|
|
|
options_overrides_iter != sanitized_options_overrides.cend();
|
|
|
|
++options_overrides_iter) {
|
|
|
|
CacheEntryRoleOptions role_options = options_overrides_iter->second;
|
|
|
|
CacheEntryRoleOptions default_options =
|
|
|
|
sanitized_table_options->cache_usage_options.options;
|
|
|
|
EXPECT_TRUE(role_options == default_options);
|
|
|
|
}
|
|
|
|
Destroy(options);
|
|
|
|
|
|
|
|
// To test option validation on unsupported CacheEntryRole
|
|
|
|
table_options = BlockBasedTableOptions();
|
|
|
|
table_options.cache_usage_options.options_overrides.insert(
|
|
|
|
{CacheEntryRole::kDataBlock,
|
|
|
|
{/*.charged = */ CacheEntryRoleOptions::Decision::kDisabled}});
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
Destroy(options);
|
|
|
|
s = TryReopen(options);
|
|
|
|
EXPECT_TRUE(s.IsNotSupported());
|
|
|
|
EXPECT_TRUE(
|
|
|
|
s.ToString().find("Enable/Disable CacheEntryRoleOptions::charged") !=
|
|
|
|
std::string::npos);
|
|
|
|
EXPECT_TRUE(
|
|
|
|
s.ToString().find(kCacheEntryRoleToCamelString[static_cast<uint32_t>(
|
|
|
|
CacheEntryRole::kDataBlock)]) != std::string::npos);
|
|
|
|
Destroy(options);
|
|
|
|
|
|
|
|
// To test option validation on existence of block cache
|
|
|
|
table_options = BlockBasedTableOptions();
|
|
|
|
table_options.no_block_cache = true;
|
|
|
|
table_options.cache_usage_options.options_overrides.insert(
|
|
|
|
{CacheEntryRole::kFilterConstruction,
|
|
|
|
{/*.charged = */ CacheEntryRoleOptions::Decision::kEnabled}});
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
Destroy(options);
|
|
|
|
s = TryReopen(options);
|
|
|
|
EXPECT_TRUE(s.IsInvalidArgument());
|
|
|
|
EXPECT_TRUE(s.ToString().find("Enable CacheEntryRoleOptions::charged") !=
|
|
|
|
std::string::npos);
|
|
|
|
EXPECT_TRUE(
|
|
|
|
s.ToString().find(kCacheEntryRoleToCamelString[static_cast<std::size_t>(
|
|
|
|
CacheEntryRole::kFilterConstruction)]) != std::string::npos);
|
|
|
|
EXPECT_TRUE(s.ToString().find("block cache is disabled") !=
|
|
|
|
std::string::npos);
|
|
|
|
Destroy(options);
|
|
|
|
}
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2021-12-10 16:12:09 +00:00
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
2015-03-17 21:08:00 +00:00
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|